diff --git a/.gitignore b/.gitignore index c246a56cf15a4..6be36bf8c243e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ paddle/fluid/API_DEV.spec paddle/fluid/API_PR.spec paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec +paddle/pten/api/*/api* *.DS_Store *.vs diff --git a/CMakeLists.txt b/CMakeLists.txt index 334a6cfcd0ee1..03f8522ad5446 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) @@ -215,6 +216,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE} option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) +option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index f15db6e094c17..a77f9f72ca6ad 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,6 +97,11 @@ if(WITH_XPU) add_definitions(-DPADDLE_WITH_XPU) endif() +if(WITH_IPU) + message(STATUS "Compile with IPU!") + add_definitions(-DPADDLE_WITH_IPU) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 581a5f93768d0..41b90345c8c5f 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,8 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -# TODO(zhhsplendid): Modify git tag after we have release tag -set(CINN_GIT_TAG develop) +set(CINN_GIT_TAG release/v0.1) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 913fbfed316d8..27a013c1763a7 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -22,9 +22,9 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) IF(WIN32) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE) - # There is a compilation parameter 'FI\"winapifamily.h\"' can't be used correctly + # There is a compilation parameter "/FI\"winapifamily.h\"" or "/FIwinapifamily.h" can't be used correctly # with Ninja on Windows. The only difference between the patch file and original - # file is that the compilation parameters are changed to 'FIwinapifamily.h'. This + # file is that the compilation parameters are changed to '/nologo'. This # patch command can be removed when upgrading to a higher version. 
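# (The copy_if_different command below swaps in the patched CMakeLists.txt whenever
# the Ninja generator is used.)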
if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(CRYPTOPP_PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "/") diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake new file mode 100644 index 0000000000000..8fd4a0741eaba --- /dev/null +++ b/cmake/external/llvm.cmake @@ -0,0 +1,110 @@ +include(FetchContent) + +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) +set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) + +set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) +set(FETCHCONTENT_QUIET OFF) +FetchContent_Declare(external_llvm + URL ${LLVM_DOWNLOAD_URL} + URL_MD5 ${LLVM_MD5} + PREFIX ${THIRD_PARTY_PATH}/llvm + SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm +) +if (NOT LLVM_PATH) + FetchContent_GetProperties(external_llvm) + if (NOT external_llvm_POPULATED) + FetchContent_Populate(external_llvm) + endif() + set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm) + set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm) + set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir) +else () + set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm) + set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir) +endif() + +if (${CMAKE_CXX_COMPILER} STREQUAL "clang++") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") +endif() + +message(STATUS "set LLVM_DIR: ${LLVM_DIR}") +message(STATUS "set MLIR_DIR: ${MLIR_DIR}") +find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR}) +find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR}) +find_package(ZLIB REQUIRED) + +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +include(AddLLVM) + +include_directories(${LLVM_INCLUDE_DIRS}) +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") +include(AddLLVM) +include(TableGen) +include(AddMLIR) + +message(STATUS "Found MLIR: ${MLIR_DIR}") +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") + +# To build with MLIR, the LLVM is build from source code using the following flags: + +#[==[ +cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;clang" \ + -DLLVM_BUILD_EXAMPLES=OFF \ + -DLLVM_TARGETS_TO_BUILD="X86" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_ZLIB=OFF \ + -DLLVM_ENABLE_RTTI=ON \ +#]==] +# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) + +add_definitions(${LLVM_DEFINITIONS}) + +llvm_map_components_to_libnames(llvm_libs Support Core irreader + X86 executionengine orcjit mcjit all codegen) + +message(STATUS "LLVM libs: ${llvm_libs}") + +get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS) +message(STATUS "MLIR libs: ${mlir_libs}") +add_definitions(${LLVM_DEFINITIONS}) + + +# The minimum needed libraries for MLIR IR parse and transform. 
+set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) + + +# tb_base is the name of a xxx.td file (without the .td suffix) +function(mlir_tablegen_on td_base) + set(options) + set(oneValueArgs DIALECT) + cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) + mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) + if (mlir_tablegen_on_DIALECT) + mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + endif() + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +function(mlir_add_rewriter td_base) + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +# Execute the mlir script with infrt-exec program. +# @name: name of the test +# @script: path to the mlir script file +function (infrt_exec_check name script) + add_test(NAME ${name} + COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck ${CMAKE_CURRENT_SOURCE_DIR}/${script}") +endfunction() diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake new file mode 100644 index 0000000000000..7947a54f8b5f1 --- /dev/null +++ b/cmake/external/poplar.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
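# Locate the Poplar and PopART installs inside the SDK pointed to by the
# POPLAR_SDK_DIR environment variable, make their headers and libraries available
# to the build, and fail early with a clear error if either one is missing.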
+if(WITH_IPU) + set(POPLAR_DIR CACHE PATH "Path to a Poplar install") + set(POPART_DIR CACHE PATH "Path to a Popart install") + set(POPLAR_SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)") + + if(DEFINED ENV{POPLAR_SDK_DIR}) + set(POPLAR_SDK_DIR $ENV{POPLAR_SDK_DIR}) + execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*" + OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o -name "poplar" + OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT IS_DIRECTORY "${POPLAR_DIR}") + message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'") + endif() + if(NOT IS_DIRECTORY "${POPART_DIR}") + message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'") + endif() + else() + message(FATAL_ERROR "You must provide a path to a Poplar install using export POPLAR_SDK_DIR=/path/to/poplar_sdk") + endif() + + message("POPLAR_DIR is ${POPLAR_DIR}") + message("POPART_DIR is ${POPART_DIR}") + + if(EXISTS ${POPLAR_DIR}) + list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR}) + set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh") + find_package(poplar REQUIRED) + include_directories("${POPLAR_DIR}/include") + link_directories("${POPLAR_DIR}/lib") + endif() + if(NOT poplar_FOUND) + message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install") + endif() + + if(EXISTS ${POPART_DIR}) + list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR}) + set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh") + find_package(popart REQUIRED COMPONENTS popart-only) + include_directories("${POPART_DIR}/include") + link_directories("${POPART_DIR}/lib") + endif() + if(NOT popart_FOUND) + message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") + endif() + add_definitions(-DONNX_NAMESPACE=onnx) + add_custom_target(extern_poplar DEPENDS poplar popart-only) +endif() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 86acd1a001250..2a028b8dc7e7f 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -204,6 +204,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) + elseif(WITH_IPU) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e) else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) @@ -243,6 +246,8 @@ ENDFUNCTION() if(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) +elseif(WITH_IPU) + SET(PROTOBUF_VERSION 3.6.1) else() SET(PROTOBUF_VERSION 3.1.0) endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 7f828fd66e2aa..d89ecd27c0954 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -34,8 +34,13 @@ ELSE () SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") ENDIF() -SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129") +if(NOT DEFINED XPU_BASE_URL) + SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") + SET(XPU_BASE_URL 
"${XPU_BASE_URL_WITHOUT_DATE}/20211129") +else() + SET(XPU_BASE_URL "${XPU_BASE_URL}") +endif() + SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7afff25664bbb..7495ee32bab95 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -151,6 +151,13 @@ set(COMMON_FLAGS ${fsanitize} ) +if(WITH_IPU) + set(COMMON_FLAGS ${COMMON_FLAGS} + -Wno-sign-compare # Warnings in Popart + -Wno-non-virtual-dtor # Warnings in Popart + ) +endif() + if(NOT APPLE) if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM)) set(COMMON_FLAGS diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index f2efc974073e5..71e1856147449 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -391,4 +391,14 @@ if (WIN32) list(APPEND third_party_deps extern_dirent) endif (WIN32) +if (WITH_INFRT) + include(external/llvm) + list(APPEND third_party_deps external_llvm) +endif() + +if (WITH_IPU) + include(external/poplar) + list(APPEND third_party_deps extern_poplar) +endif() + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index b3a1b2e8c9587..4b88689b9b6df 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(scripts) add_subdirectory(testing) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") add_subdirectory(pten) +add_subdirectory(infrt) add_subdirectory(fluid) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 641110802f1fd..51f1d936bd70a 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -11,14 +11,15 @@ else() endif() cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc - interceptor.cc compute_interceptor.cc interceptor_message_service.cc message_bus.cc - DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper - ${BRPC_DEPS}) + interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc + DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper op_registry + executor_gc_helper ${BRPC_DEPS}) if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc new file mode 100644 index 0000000000000..72c689732b5b7 --- /dev/null +++ 
b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h" + +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace distributed { + +AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id, + TaskNode* node) + : ComputeInterceptor(interceptor_id, node) { + run_per_steps_ = node->run_per_steps(); + run_at_offset_ = node->run_at_offset(); + reply_up_per_steps_ = node->reply_up_per_steps(); + send_down_per_steps_ = node->send_down_per_steps(); +} + +void AmplifierInterceptor::RunOps() { + // run_per_steps_, run_at_offset_ + // 4, 0 --> run at step 0, 4, 8, 12 + // 4, 3 --> run at step 3, 7, 11, 15 + if ((step_ % run_per_steps_) == run_at_offset_) { + ComputeInterceptor::RunOps(); + } +} + +void AmplifierInterceptor::SendDataReadyToDownStream() { + // run multi times, send ready one times to downstream, that is + // input multi times, output one times + if (step_ % send_down_per_steps_ == 0) { + ComputeInterceptor::SendDataReadyToDownStream(); + } +} + +void AmplifierInterceptor::ReplyCompletedToUpStream() { + // run multi times, reply one times to upstream, that is + // input one times, output multi times + if (step_ % reply_up_per_steps_ == 0) { + ComputeInterceptor::ReplyCompletedToUpStream(); + } +} + +REGISTER_INTERCEPTOR(Amplifier, AmplifierInterceptor); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h new file mode 100644 index 0000000000000..776aa8d3e88db --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
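// AmplifierInterceptor (declared below) runs its ops only on steps where
// step % run_per_steps_ == run_at_offset_, and throttles the DATA_IS_READY /
// DATE_IS_USELESS handshakes via send_down_per_steps_ and reply_up_per_steps_,
// so a 1:m or m:1 task such as LRSched or Optimize can sit next to tasks that
// run once per micro batch.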
+ +#pragma once + +#include + +#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" + +namespace paddle { +namespace distributed { + +class AmplifierInterceptor : public ComputeInterceptor { + public: + AmplifierInterceptor(int64_t interceptor_id, TaskNode* node); + + private: + void RunOps() override; + void SendDataReadyToDownStream() override; + void ReplyCompletedToUpStream() override; + + int64_t run_per_steps_{1}; + int64_t run_at_offset_{0}; + + // one input produces multi times output + int64_t reply_up_per_steps_{1}; + // one output need multi times input + int64_t send_down_per_steps_{1}; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 108a21b92fdfd..009df6438e270 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -16,22 +16,25 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace distributed { USE_INTERCEPTOR(Compute); +USE_INTERCEPTOR(Amplifier); -void Carrier::Init( - const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place) { +void Carrier::Init(std::shared_ptr runtime_graph, + framework::Scope* root_scope, + framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place) { PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( "Carrier is already init.")); - interceptor_id_to_node_ = interceptor_id_to_node; + runtime_graph_ = runtime_graph; minibatch_scope_ = minibatch_scope; microbatch_scopes_ = microbatch_scopes; place_ = place; @@ -41,15 +44,34 @@ void Carrier::Init( is_init_ = true; } -Carrier::~Carrier() { +void Carrier::Release() { // NOTE(wangxi): must join before `Derived Interceptor` destruct, // otherwise Derived object will be destructed before thread complete. + + // Sending STOP msg to the source interceptor + MessageBus& msg_bus = MessageBus::Instance(); + PADDLE_ENFORCE_EQ(msg_bus.IsInit(), true, + platform::errors::PreconditionNotMet( + "Message bus has not been initialized.")); + for (int64_t id : source_interceptor_ids_) { + VLOG(3) << "Carrier Release is sending stop to source interceptor " << id + << "."; + InterceptorMessage stop_msg; + // source node STOP is send by carrier, so set src_id=-1 + stop_msg.set_src_id(-1); + stop_msg.set_dst_id(id); + stop_msg.set_message_type(STOP); + msg_bus.Send(stop_msg); + } + // TODO(wangxi): Maybe need a better to use thread. 
for (auto& interceptor : interceptor_idx_to_interceptor_) { interceptor.second->Join(); } } +Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } + bool Carrier::EnqueueInterceptorMessage( const InterceptorMessage& interceptor_message) { // enqueue message to interceptor @@ -92,19 +114,22 @@ Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) { } void Carrier::Start() { - // TODO(fleet_executor dev): this start is a faked one, need replace - for (const auto& pair : interceptor_idx_to_interceptor_) { - VLOG(3) << "Fake run is sending start to interceptor " << pair.first << "."; - InterceptorMessage tmp_msg; - tmp_msg.set_src_id(pair.first); - tmp_msg.set_dst_id(pair.first); - tmp_msg.set_message_type(DATA_IS_READY); - MessageBus& message_bus_instance = MessageBus::Instance(); - PADDLE_ENFORCE_EQ(message_bus_instance.IsInit(), true, - platform::errors::PreconditionNotMet( - "Message bus has not been initialized.")); - message_bus_instance.Send(tmp_msg); + MessageBus& msg_bus = MessageBus::Instance(); + PADDLE_ENFORCE_EQ(msg_bus.IsInit(), true, + platform::errors::PreconditionNotMet( + "Message bus has not been initialized.")); + + for (int64_t id : source_interceptor_ids_) { + VLOG(3) << "Carrier Start is sending start to source interceptor " << id + << "."; + InterceptorMessage start_msg; + // source node data_is_ready is send by carrier, so set src_id=-1 + start_msg.set_src_id(-1); + start_msg.set_dst_id(id); + start_msg.set_message_type(DATA_IS_READY); + msg_bus.Send(start_msg); } + std::unique_lock lock(running_mutex_); cond_var_.wait(lock); dev_ctx_->Wait(); @@ -136,6 +161,17 @@ void Carrier::SetCreatingFlag(bool flag) { creating_interceptors_ = flag; creating_flag_mutex_.unlock(); if (!flag) { + for (auto& pair : interceptor_idx_to_interceptor_) { + // update the source interceptor id + if (std::find(source_interceptor_ids_.begin(), + source_interceptor_ids_.end(), + pair.first) == source_interceptor_ids_.end()) { + auto task = pair.second->GetTaskNode(); + if (task != nullptr && task->upstream().empty()) { + source_interceptor_ids_.emplace_back(pair.first); + } + } + } // finish create interceptors outside, handle tmp messsages HandleTmpMessages(); } @@ -156,32 +192,70 @@ void Carrier::HandleTmpMessages() { message_tmp_.clear(); } +static std::shared_ptr GetGC( + const platform::Place& place) { + int64_t max_memory_size = framework::GetEagerDeletionThreshold(); + std::shared_ptr gc; + if (max_memory_size >= 0) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(place)) { + if (framework::IsFastEagerDeletionModeEnabled()) { + gc.reset(new framework::UnsafeFastGPUGarbageCollector( + BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + } + } +#endif + } // max_memory_size >= 0 + + return gc; +} + void Carrier::CreateInterceptors() { + if (runtime_graph_->intercepter_id_to_node().empty()) return; + + auto gc = GetGC(place_); + // create each Interceptor - if (!interceptor_id_to_node_.empty()) { - // no auto init since there is no config - for (const auto& item : interceptor_id_to_node_) { - int64_t interceptor_id = item.first; - TaskNode* task_node = item.second; - - // TODO(wangxi): use node_type to select different Interceptor - auto interceptor = - std::make_unique(interceptor_id, task_node); - interceptor->SetPlace(place_); - interceptor->SetMiniBatchScope(minibatch_scope_); - interceptor->SetMicroBatchScope(microbatch_scopes_); - interceptor->SetRootScope(root_scope_); - SetInterceptor(interceptor_id, 
std::move(interceptor)); - VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id - << "."; + // no auto init since there is no config + for (const auto& item : runtime_graph_->intercepter_id_to_node()) { + int64_t interceptor_id = item.first; + TaskNode* task_node = item.second; + + PADDLE_ENFORCE_LT( + task_node->run_at_offset(), task_node->run_per_steps(), + platform::errors::InvalidArgument( + "Interceptor's run_at_offset must < run_per_steps, must now " + "run_at_offset=%ld run_per_steps=%ld", + task_node->run_at_offset(), task_node->run_per_steps())); + + std::unique_ptr interceptor; + if (task_node->type().empty()) { + // TODO(wangxi): delete this in future + interceptor.reset(new Interceptor(interceptor_id, task_node)); + } else { + interceptor = InterceptorFactory::Create(task_node->type(), + interceptor_id, task_node); + } + interceptor->SetPlace(place_); + interceptor->SetMiniBatchScope(minibatch_scope_); + interceptor->SetMicroBatchScope(microbatch_scopes_); + interceptor->SetRootScope(root_scope_); + interceptor->SetGC(gc); + + SetInterceptor(interceptor_id, std::move(interceptor)); + VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id + << " with type: " << task_node->type() << "."; + + if (task_node->upstream().empty()) { + source_interceptor_ids_.emplace_back(interceptor_id); } - // The carrier will be always waiting for outside initializer - // since there is no interceptor has been created during auto init - creating_flag_mutex_.lock(); - creating_interceptors_ = false; - creating_flag_mutex_.unlock(); - HandleTmpMessages(); } + // The carrier will be always waiting for outside initializer + // since there is no interceptor has been created during auto init + creating_flag_mutex_.lock(); + creating_interceptors_ = false; + creating_flag_mutex_.unlock(); + HandleTmpMessages(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index c4c6a41846474..0c54201c94034 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ namespace distributed { class TaskNode; class InterceptorMessageServiceImpl; +class RuntimeGraph; // A singleton MessageBus class Carrier final { @@ -47,13 +49,13 @@ class Carrier final { return carrier; } - void Init( - const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place); + void Init(std::shared_ptr runtime_graph, + framework::Scope* root_scope, framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place); ~Carrier(); + void Release(); // Enqueue a message to corresponding interceptor id bool EnqueueInterceptorMessage(const InterceptorMessage& interceptor_message); @@ -83,13 +85,12 @@ class Carrier final { void HandleTmpMessages(); - // interceptor logic id to the Nodes info - std::unordered_map interceptor_id_to_node_; - // interceptor logic id to actually interceptor std::unordered_map> interceptor_idx_to_interceptor_; + std::vector source_interceptor_ids_; + std::vector message_tmp_{}; std::mutex tmp_message_mutex_; bool creating_interceptors_{true}; @@ -102,7 +103,8 @@ class Carrier final { framework::Scope* root_scope_; framework::Scope* minibatch_scope_; paddle::platform::Place place_; - 
paddle::platform::DeviceContext* dev_ctx_ = nullptr; + paddle::platform::DeviceContext* dev_ctx_{nullptr}; + std::shared_ptr runtime_graph_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 3008c83069942..35905125a0a43 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -27,19 +28,15 @@ ComputeInterceptor::ComputeInterceptor(int64_t interceptor_id, TaskNode* node) } void ComputeInterceptor::PrepareDeps() { - auto& upstream = GetTaskNode()->upstream(); - auto& downstream = GetTaskNode()->downstream(); + auto& upstream = node_->upstream(); + auto& downstream = node_->downstream(); - // TODO(wangxi): get from task node - int64_t in_buff_size = std::numeric_limits::max(); - int64_t out_buff_size = 2; - - for (auto up_id : upstream) { - in_readys_.emplace(up_id, std::make_pair(in_buff_size, 0)); - in_stops_.emplace(up_id, false); + for (auto up : upstream) { + in_readys_.emplace(up.first, std::make_pair(up.second, 0)); + in_stops_.emplace(up.first, false); } - for (auto down_id : downstream) { - out_buffs_.emplace(down_id, std::make_pair(out_buff_size, 0)); + for (auto down : downstream) { + out_buffs_.emplace(down.first, std::make_pair(down.second, 0)); } // source compute node, should we add a new SourceInterceptor? @@ -50,18 +47,28 @@ void ComputeInterceptor::PrepareDeps() { "Source ComputeInterceptor must run at least one " "times, but now max_run_times=%ld", node_->max_run_times())); + in_readys_.emplace(-1, + std::make_pair(std::numeric_limits::max(), 0)); } + + // If there is no downstream or every downstream is in different rank, + // then this interceptor is the last one for current rank. + // This can be get during init, can be cached for later use. 
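// Run() later uses is_last_ to stop the carrier once the last interceptor of
// this rank has completed a full round of max_run_times micro-batch steps.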
+ is_last_ = downstream.empty(); } void ComputeInterceptor::IncreaseReady(int64_t up_id) { - // source node has no upstream, data_is_ready is send by carrier or others - if (is_source_ && up_id == -1) return; - auto it = in_readys_.find(up_id); PADDLE_ENFORCE_NE(it, in_readys_.end(), platform::errors::NotFound( "Cannot find upstream=%lld in in_readys.", up_id)); + // source node has no upstream, data_is_ready is send by carrier or others + if (is_source_ && up_id == -1) { + it->second.second += GetTaskNode()->max_run_times(); + return; + } + auto max_ready_size = it->second.first; auto ready_size = it->second.second; ready_size += 1; @@ -92,7 +99,11 @@ bool ComputeInterceptor::IsInputReady() { for (auto& ins : in_readys_) { auto ready_size = ins.second.second; // not ready, return false - if (ready_size == 0) return false; + if (ready_size == 0) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << "'s upstreams aren't all ready."; + return false; + } } return true; } @@ -102,17 +113,15 @@ bool ComputeInterceptor::CanWriteOutput() { auto max_buffer_size = outs.second.first; auto used_size = outs.second.second; // full, return false - if (used_size == max_buffer_size) return false; + if (used_size == max_buffer_size) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << "'s out buffer is full."; + return false; + } } return true; } -// only source node need reset -bool ComputeInterceptor::ShouldReset() { - if (is_source_ && step_ == node_->max_run_times()) return true; - return false; -} - void ComputeInterceptor::SendDataReadyToDownStream() { for (auto& outs : out_buffs_) { auto down_id = outs.first; @@ -129,7 +138,9 @@ void ComputeInterceptor::SendDataReadyToDownStream() { InterceptorMessage ready_msg; ready_msg.set_message_type(DATA_IS_READY); - VLOG(3) << "ComputeInterceptor Send data_is_ready msg to " << down_id; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Send data_is_ready msg to " << down_id + << " for step: " << step_; Send(down_id, ready_msg); } } @@ -146,40 +157,47 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { ready_size)); ins.second.second = ready_size; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Reply data_is_useless msg to " << up_id + << " for step: " << step_; + if (up_id == -1) return; + InterceptorMessage reply_msg; reply_msg.set_message_type(DATE_IS_USELESS); - VLOG(3) << "ComputeInterceptor Reply data_is_useless msg to " << up_id; Send(up_id, reply_msg); } } -void ComputeInterceptor::Run() { - // If there is no limit, source interceptor can be executed - // an unlimited number of times. 
- // Now source node can only run - if (ShouldReset()) { - for (auto& out_buff : out_buffs_) { - // buffer is using - if (out_buff.second.second != 0) return; +void ComputeInterceptor::RunOps() { + VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " + << step_ + 1 << " time."; + for (auto op : node_->ops()) { + op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_); + if (gc_) { + framework::DeleteUnusedTensors( + *microbatch_scopes_[step_ % node_->max_run_times()], op, + node_->unused_vars(), gc_.get()); } - step_ = 0; // reset - return; } +} - while (IsInputReady() && CanWriteOutput() && !ShouldReset()) { +void ComputeInterceptor::Run() { + while (IsInputReady() && CanWriteOutput()) { VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; - // step_ %= node_->max_run_times(); - for (auto op : node_->ops()) { - auto* scope = microbatch_scopes_[step_ % node_->max_slot_nums()]; - op->Run(*scope, place_); - } + RunOps(); ++step_; // send to downstream and increase buff used SendDataReadyToDownStream(); // reply to upstream and decrease ready data ReplyCompletedToUpStream(); + // Try to stop Carrier + if (is_last_ && (step_ % node_->max_run_times() == 0)) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << " is stopping carrier."; + StopCarrier(); + } } } @@ -221,11 +239,6 @@ void ComputeInterceptor::TryStop() { Send(down_id, stop); } stop_ = true; - - if (out_buffs_.size() == 0) { - // TODO(fleet executor dev) need a better place to notify - StopCarrier(); - } } void ComputeInterceptor::Compute(const InterceptorMessage& msg) { diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 97e6da2f00eae..fb82ce76c7bdb 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -25,16 +25,20 @@ class ComputeInterceptor : public Interceptor { public: ComputeInterceptor(int64_t interceptor_id, TaskNode* node); + protected: + virtual void RunOps(); + virtual void SendDataReadyToDownStream(); + virtual void ReplyCompletedToUpStream(); + + int64_t step_{0}; + + private: void PrepareDeps(); void IncreaseReady(int64_t up_id); void DecreaseBuff(int64_t down_id); bool IsInputReady(); bool CanWriteOutput(); - bool ShouldReset(); - - void SendDataReadyToDownStream(); - void ReplyCompletedToUpStream(); void Run(); void Compute(const InterceptorMessage& msg); @@ -42,9 +46,8 @@ class ComputeInterceptor : public Interceptor { void ReceivedStop(int64_t up_id); void TryStop(); - private: bool is_source_{false}; - int64_t step_{0}; + bool is_last_{false}; // upstream_id-->(max_ready_size, ready_size) std::map> in_readys_{}; diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index ec60ec5fd5901..3a823674d842c 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -31,14 +31,12 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) { "Error occurs while parsing string to proto")); } -FleetExecutor::~FleetExecutor() { - // Destroy Executor -} +FleetExecutor::~FleetExecutor() { root_scope_->DropKids(); } void FleetExecutor::Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place) { - runtime_graph_ = std::make_unique(program_desc, exe_desc_); + runtime_graph_ = 
std::make_shared(program_desc, exe_desc_); root_scope_ = scope; place_ = place; PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument( @@ -58,8 +56,8 @@ void FleetExecutor::Init(const framework::ProgramDesc& program_desc, void FleetExecutor::InitCarrier() { Carrier& carrier_instance = Carrier::Instance(); if (!carrier_instance.IsInit()) { - carrier_instance.Init(runtime_graph_->intercepter_id_to_node(), root_scope_, - minibatch_scope_, microbatch_scopes_, place_); + carrier_instance.Init(runtime_graph_, root_scope_, minibatch_scope_, + microbatch_scopes_, place_); } } @@ -111,10 +109,17 @@ void FleetExecutor::Run() { message_bus_instance.IsInit(), true, platform::errors::Unavailable("MessageBus has not been init yet.")); carrier_instance.Start(); + for (auto* micro_scop : microbatch_scopes_) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + micro_scop->DropKids(); + } } -void FleetExecutor::Release() { root_scope_->DropKids(); } - void FleetExecutor::CopyParameters(int microbatch_id, const framework::ProgramDesc& program) { auto& global_block = program.Block(0); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index 7be18772e9ec9..ac857fb6c38a2 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -39,7 +39,6 @@ class FleetExecutor final { void Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place); void Run(); - void Release(); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); @@ -47,7 +46,7 @@ class FleetExecutor final { void InitCarrier(); void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); FleetExecutorDesc exe_desc_; - std::unique_ptr runtime_graph_; + std::shared_ptr runtime_graph_; framework::Scope* root_scope_; framework::Scope* minibatch_scope_; platform::Place place_; diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto index 1b12f1239dcbd..6890c311ec003 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto @@ -21,12 +21,11 @@ message RankInfo { } message FleetExecutorDesc { - optional string grain = 1 [ default = "coarse" ]; - optional int64 cur_rank = 2 [ default = 0 ]; // Rank id of current processor - repeated RankInfo cluster_info = 3; - optional int32 dp_degree = 4 [ default = 1 ]; - optional int32 mp_degree = 5 [ default = 1 ]; - optional int32 pp_degree = 6 [ default = 1 ]; - optional int64 num_micro_batches = 7 [ default = 1 ]; - optional int64 num_slots = 8 [ default = 1 ]; + optional int64 cur_rank = 1 [ default = 0 ]; // Rank id of current processor + repeated RankInfo cluster_info = 2; + optional int32 dp_degree = 3 [ default = 1 ]; + optional int32 mp_degree = 4 [ default = 1 ]; + optional int32 pp_degree = 5 [ default = 1 ]; + optional int64 num_micro_batches = 6 [ default = 1 ]; + optional int64 num_slots = 7 [ default = 1 ]; } 
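The step gating added in this patch is easiest to see with a small worked example. The sketch below is standalone and not part of the patch (the helper name ShouldRun and the printed table are illustrative only); it applies the same predicate as AmplifierInterceptor::RunOps() with the values RuntimeGraph assigns to the LRSched and Optimize nodes: run_per_steps = num_micro_batches for both, run_at_offset = 0 for LRSched and num_micro_batches - 1 for Optimize.

// standalone sketch, illustrative only, not part of this patch
#include <cstdint>
#include <iostream>

// Same gating predicate as AmplifierInterceptor::RunOps().
bool ShouldRun(int64_t step, int64_t run_per_steps, int64_t run_at_offset) {
  return (step % run_per_steps) == run_at_offset;
}

int main() {
  const int64_t m = 4;  // num_micro_batches
  for (int64_t step = 0; step < 2 * m; ++step) {
    std::cout << "step " << step
              << "  lr_sched=" << ShouldRun(step, m, 0)      // fires at 0, 4, ...
              << "  optimize=" << ShouldRun(step, m, m - 1)  // fires at 3, 7, ...
              << "\n";
  }
  return 0;
}

With num_micro_batches = 4 this prints LRSched firing at steps 0 and 4 and Optimize at steps 3 and 7, matching the "4, 0 --> run at step 0, 4, 8, 12" comment in amplifier_interceptor.cc.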
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 40429502825c9..dd7b89c4b8119 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -40,22 +40,9 @@ void Interceptor::Join() { void Interceptor::RegisterMsgHandle(MsgHandle handle) { handle_ = handle; } void Interceptor::Handle(const InterceptorMessage& msg) { - if (handle_) { - handle_(msg); - } else { - VLOG(3) << "Interceptor is using default message handler. This handler is " - "only used for test purpose. Check whether you init interceptor " - "in the proper way."; - if (msg.message_type() == DATA_IS_READY) { - VLOG(3) << "Fake handler is sending stop message to it self."; - InterceptorMessage msg; - msg.set_message_type(STOP); - Send(interceptor_id_, msg); - } else if (msg.message_type() == STOP) { - stop_ = true; - StopCarrier(); - } - } + PADDLE_ENFORCE_NOT_NULL(handle_, platform::errors::PreconditionNotMet( + "Message handle is not registered.")); + handle_(msg); } void Interceptor::StopCarrier() { diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index ef1ffb1a53b3f..b0c1e46f03138 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -31,6 +31,7 @@ namespace paddle { namespace framework { class Scope; +class GarbageCollector; } namespace distributed { @@ -73,6 +74,9 @@ class Interceptor { void SetMicroBatchScope(const std::vector& scopes) { microbatch_scopes_ = scopes; } + void SetGC(const std::shared_ptr& gc) { + gc_ = gc; + } TaskNode* GetTaskNode() const { return node_; } @@ -94,6 +98,7 @@ class Interceptor { framework::Scope* root_scope_{nullptr}; framework::Scope* minibatch_scope_{nullptr}; std::vector microbatch_scopes_{}; + std::shared_ptr gc_{nullptr}; private: // pool the local mailbox, parse the Message diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 2071477372c9e..f087de69fa96b 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "paddle/fluid/distributed/fleet_executor/carrier.h" @@ -56,6 +57,10 @@ void MessageBus::Init( bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { + // NOTE: fleet_executor inits carrier before message bus, + // therefore the message bus's destructor will be called first + Carrier& carrier = Carrier::Instance(); + carrier.Release(); VLOG(3) << "Message bus releases resource."; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) @@ -86,6 +91,8 @@ bool MessageBus::Send(const InterceptorMessage& interceptor_message) { << retry_time << " times retries."; return true; } + VLOG(3) << "Message bus sends failed, retry after 1 seconds."; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } VLOG(3) << "Message bus sends inter rank fail after 10 times retries."; return false; @@ -117,16 +124,40 @@ void MessageBus::ListenPort() { brpc::ServerOptions options; options.idle_timeout_sec = -1; int retry_times = 0; - int interval = 1000; + int interval = 100; while (server_.Start(ip_for_brpc, &options) != 0) { ++retry_times; LOG(INFO) << "Message bus is retring for starting brpc for " << retry_times 
<< " times. And will retry after " << interval / 1000 << " seconds."; std::this_thread::sleep_for(std::chrono::milliseconds(interval)); - interval += 2000; + interval += 500; } LOG(INFO) << "Message bus's listen port thread starts successful."; + + std::set visit; + InterceptorMessage tmp_msg; + tmp_msg.set_ctrl_message(true); + for (auto pair : interceptor_id_to_rank_) { + if (rank_to_addr_.at(pair.second) == addr_) { + tmp_msg.set_src_id(pair.first); + } + } + for (auto pair : interceptor_id_to_rank_) { + int64_t rank = pair.second; + if (rank_to_addr_.at(rank) == addr_) { + continue; + } + tmp_msg.set_dst_id(pair.first); + if (visit.find(rank) == visit.end()) { + VLOG(3) << "Message bus is testing connection for rank: " << rank << "."; + visit.insert(rank); + while (!Send(tmp_msg)) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + VLOG(3) << "Message bus has connected to rank: " << rank << "."; + } + } #else LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when Paddle is " @@ -136,6 +167,9 @@ void MessageBus::ListenPort() { } bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) { + // -1 is sent by carrier to source interceptor + if (src_id == -1) src_id = dst_id; + // check whether the dst is the same rank or different rank with src const auto& src_rank = interceptor_id_to_rank_.find(src_id); const auto& dst_rank = interceptor_id_to_rank_.find(dst_id); diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 3a76bd43f9d55..32f9e36e53037 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" @@ -100,7 +101,9 @@ std::vector RuntimeGraph::functionality_order = { RuntimeGraph::RuntimeGraph(const ProgramDesc& program, const FleetExecutorDesc& exe_desc) : exe_desc_(exe_desc) { - if (exe_desc.grain() == "coarse") { + if (exe_desc.pp_degree() == 1) { + OriginProgramCompile(program); + } else { SplitProgramBasedFunctionality(program); AssignTaskToIntercepter(); FakeDependence(); @@ -108,10 +111,32 @@ RuntimeGraph::RuntimeGraph(const ProgramDesc& program, } } +void RuntimeGraph::OriginProgramCompile(const ProgramDesc& program) { + int64_t cur_rank = exe_desc_.cur_rank(); + int64_t max_run_times = exe_desc_.num_micro_batches(); + int64_t max_slot_nums = exe_desc_.num_slots(); + + auto task_node = std::make_unique(program, cur_rank, max_run_times, + max_slot_nums); + // TODO(wangxi): add skip vars + auto unused_vars = + framework::GetUnusedVars(program.Block(0), task_node->unique_ops(), {}); + task_node->SetType("Compute"); + task_node->SetUnusedVars(unused_vars); + + task_nodes_.emplace_back(std::move(task_node)); + int64_t task_id = task_nodes_[0]->task_id(); + intercepter_id_to_rank_.insert({task_id, cur_rank}); + intercepter_id_to_node_.insert({task_id, task_nodes_[0].get()}); +} + void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { for (const auto& op_desc : program.Block(0).AllOps()) { ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); } + // TODO(wangxi): how to gc pipeline backward send + auto unused_vars = 
framework::GetUnusedVars(program.Block(0), ops_, {}); + std::unordered_map> role_to_ops; for (const auto& op : ops_) { int32_t op_role = op->Attr("op_role"); @@ -135,33 +160,44 @@ void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { } role_to_ops.at(new_op_role_id).emplace_back(op.get()); } + int64_t cur_rank = exe_desc_.cur_rank(); DistCoordSys coord_sys(exe_desc_.dp_degree(), exe_desc_.pp_degree(), exe_desc_.mp_degree()); const auto& coord = coord_sys.RankToCoord(cur_rank); int pipeline_stage = coord.pp_idx; int64_t num_pipeline_stages = exe_desc_.pp_degree(); + // TODO(fleet_executor dev): start up steps should be a config `num_slots` int64_t start_up_steps = num_pipeline_stages - pipeline_stage; int64_t num_micro_batches = exe_desc_.num_micro_batches(); int64_t task_id = cur_rank * functionality_order.size(); for (std::size_t i = 0; i < functionality_order.size(); ++i) { + VLOG(3) << "Runtime graph is creating task node for: " << task_id << "."; OpRole role = functionality_order[i]; int32_t role_id = static_cast(role); int64_t max_run_times = num_micro_batches; int64_t max_slot_nums = start_up_steps; - if (IsLRSched(role_id) || IsOptimize(role_id)) { - max_run_times = 1; - max_slot_nums = 1; + // NOTE: use short path, each interceptor should run for max_run_times + std::vector task_ops{}; + if (role_to_ops.find(role_id) != role_to_ops.end()) { + task_ops = role_to_ops.at(role_id); } - if (role_to_ops.find(role_id) == role_to_ops.end()) { - task_nodes_.emplace_back(TaskNode::CreateEmptyTaskNode( - role_id, cur_rank, task_id, max_run_times, max_slot_nums)); + std::unique_ptr task_node = std::make_unique( + role_id, task_ops, cur_rank, task_id, max_run_times, max_slot_nums); + if (IsLRSched(role_id) || IsOptimize(role_id)) { + task_node->SetType("Amplifier"); + if (IsLRSched(role_id)) { + task_node->SetRunPerSteps(max_run_times); + } else { + task_node->SetRunAtOffset(max_run_times - 1); + task_node->SetRunPerSteps(max_run_times); + } } else { - task_nodes_.emplace_back( - TaskNode::CreateTaskNode(role_id, role_to_ops.at(role_id), cur_rank, - task_id, max_run_times, max_slot_nums)); + task_node->SetType("Compute"); } + task_node->SetUnusedVars(unused_vars); + task_nodes_.emplace_back(std::move(task_node)); ++task_id; } } @@ -176,42 +212,77 @@ void RuntimeGraph::FakeDependence() { downstream_coord.pp_idx += 1; int64_t pp_upstream = coord_sys.CoordToRank(upstream_coord); int64_t pp_downstream = coord_sys.CoordToRank(downstream_coord); + bool is_first_stage = (pp_upstream == -1); + bool is_last_stage = (pp_downstream == -1); + int32_t num_of_functionality = functionality_order.size(); - // lr -> forward -> backward -> optimize - // | | - // lr -> forward -> backward -> optimize + // lr(1:m) -> forward -> backward -> (m:1)optimize + // ↑ ↓ + // lr(1:m) -> forward -> backward -> (m:1)optimize + // ↑ ↓ + // lr(1:m) -> forward -> backward -> (m:1)optimize for (std::size_t i = 0; i < task_nodes_.size(); ++i) { - if (i != 0) { - task_nodes_[i]->AddUpstreamTask(cur_rank * num_of_functionality + i - 1); + auto& node = task_nodes_[i]; + bool is_forward = IsForward(node->role()); + bool is_backward = IsBackward(node->role()); + + int64_t cur_id = cur_rank * num_of_functionality + i; + int64_t prev_id = cur_id - 1; + int64_t next_id = cur_id + 1; + + int64_t upstream_id = pp_upstream * num_of_functionality + i; + int64_t downstream_id = pp_downstream * num_of_functionality + i; + + // 1F1B, last stage pp_buff_size should be 1, while first stage + // pp_buff_size should be 
pp_degree + int64_t pp_buff_size = exe_desc_.pp_degree() - coord.pp_idx; + + std::vector> ups; + std::vector> downs; + + if (i != 0) { // not lr + int64_t buff_size = is_backward ? pp_buff_size : 2; + ups.emplace_back(prev_id, buff_size); } - if (i != task_nodes_.size() - 1) { - task_nodes_[i]->AddDownstreamTask(cur_rank * num_of_functionality + i + - 1); + if (i != task_nodes_.size() - 1) { // not optimize + int64_t buff_size = is_forward ? pp_buff_size : 2; + downs.emplace_back(next_id, buff_size); } - if (IsForward(task_nodes_[i]->role())) { - if (pp_upstream != -1) { - task_nodes_[i]->AddUpstreamTask(pp_upstream * num_of_functionality + i); + + if (is_forward) { + if (!is_first_stage) { + ups.emplace_back(upstream_id, 2); } - if (pp_downstream != -1) { - task_nodes_[i]->AddDownstreamTask(pp_downstream * num_of_functionality + - i); + if (!is_last_stage) { + downs.emplace_back(downstream_id, 2); } - } else if (IsBackward(task_nodes_[i]->role())) { - if (pp_downstream != -1) { - task_nodes_[i]->AddUpstreamTask(pp_downstream * num_of_functionality + - i); + } else if (is_backward) { + if (!is_last_stage) { + ups.emplace_back(downstream_id, 2); } - if (pp_upstream != -1) { - task_nodes_[i]->AddDownstreamTask(pp_upstream * num_of_functionality + - i); + if (!is_first_stage) { + downs.emplace_back(upstream_id, 2); } } + + for (auto up : ups) { + VLOG(3) << "Task(" << cur_id << ") AddUpstream Task(" << up.first + << ") with buff_size=" << up.second; + node->AddUpstreamTask(up.first, up.second); + } + for (auto down : downs) { + VLOG(3) << "Task(" << cur_id << ") AddDownstream Task(" << down.first + << ") with buff_size=" << down.second; + node->AddDownstreamTask(down.first, down.second); + } } } void RuntimeGraph::AssignTaskToIntercepter() { for (const auto& task : task_nodes_) { int64_t intercepter_id = task->task_id(); + VLOG(3) << "Runtime graph is assigning task to interceptor: " + << intercepter_id << " with type: " << task->type() << "."; if (intercepter_id_to_node_.find(intercepter_id) != intercepter_id_to_node_.end()) { PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h index b19456962d631..26b758767c07f 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.h +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h @@ -52,6 +52,7 @@ class RuntimeGraph final { void FakeDependence(); void AssignTaskToIntercepter(); void FakeRuntimeInfo(); + void OriginProgramCompile(const ProgramDesc& program); // LRSched, Forward, Backward, Optimize static std::vector functionality_order; std::vector> task_nodes_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 07fd091b04d97..e92ab09d481e8 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -13,6 +13,7 @@ // limitations under the License. 
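// With this change TaskNode builds its operators from the given ProgramDesc
// (ops_vec_ owns them, ops_ keeps raw pointers) and records a per-edge buffer
// size in the upstream_/downstream_ maps; see the header changes below.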
#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -30,6 +31,12 @@ TaskNode::TaskNode(const framework::ProgramDesc& program, int64_t rank, // Should be serially invoked, not thread-safe static int64_t task_node_cnt = 0; task_id_ = task_node_cnt++; + for (const auto& op_desc : program.Block(0).AllOps()) { + ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); + } + for (const auto& op : ops_vec_) { + ops_.emplace_back(op.get()); + } } TaskNode::TaskNode(int32_t role, const std::vector& ops, @@ -50,30 +57,14 @@ TaskNode::TaskNode(int32_t role, int64_t rank, int64_t task_id, max_run_times_(max_run_times), max_slot_nums_(max_slot_nums) {} -std::unique_ptr TaskNode::CreateEmptyTaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) { - return std::make_unique(role, rank, task_id, max_run_times, - max_slot_nums); -} - -std::unique_ptr TaskNode::CreateTaskNode( - int32_t role, const std::vector& ops, int64_t rank, - int64_t task_id, int64_t max_run_times, int64_t max_slot_nums) { - return std::make_unique(role, ops, rank, task_id, max_run_times, - max_slot_nums); +bool TaskNode::AddUpstreamTask(int64_t task_id, int64_t buff_size) { + const auto& ret = upstream_.emplace(task_id, buff_size); + return ret.second; } -bool TaskNode::AddUpstreamTask(int64_t task_id) { - const auto& ret = upstream_.insert(task_id); - return *ret.first == task_id; -} - -bool TaskNode::AddDownstreamTask(int64_t task_id) { - const auto& ret = downstream_.insert(task_id); - return *ret.first == task_id; +bool TaskNode::AddDownstreamTask(int64_t task_id, int64_t buff_size) { + const auto& ret = downstream_.emplace(task_id, buff_size); + return ret.second; } std::string TaskNode::DebugString() const { @@ -85,5 +76,34 @@ std::string TaskNode::DebugString() const { os << "\n"; return os.str(); } + +void TaskNode::SetRunPerSteps(int64_t value) { + PADDLE_ENFORCE_GE(value, 1, + platform::errors::InvalidArgument( + "run_per_steps must >= 1, but received %ld", value)); + run_per_steps_ = value; +} + +void TaskNode::SetRunAtOffset(int64_t value) { + PADDLE_ENFORCE_GE(value, 0, + platform::errors::InvalidArgument( + "run_at_offset must >= 0, but received %ld", value)); + run_at_offset_ = value; +} + +void TaskNode::SetReplyUpPerSteps(int64_t value) { + PADDLE_ENFORCE_GE( + value, 1, platform::errors::InvalidArgument( + "reply_up_per_steps must >= 1, but received %ld", value)); + reply_up_per_steps_ = value; +} + +void TaskNode::SetSendDownPerSteps(int64_t value) { + PADDLE_ENFORCE_GE( + value, 1, platform::errors::InvalidArgument( + "send_down_per_steps must >= 1, but received %ld", value)); + send_down_per_steps_ = value; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 8f4f9d80c42a5..37105bdd230ab 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -44,38 +44,69 @@ class TaskNode final { int32_t role() const { return role_; } int64_t max_run_times() const { return max_run_times_; } int64_t max_slot_nums() const { return max_slot_nums_; } - const std::unordered_set& upstream() const { return upstream_; } - const std::unordered_set& downstream() const { return downstream_; } + int64_t run_per_steps() const { return run_per_steps_; } + int64_t 
run_at_offset() const { return run_at_offset_; } + int64_t reply_up_per_steps() const { return reply_up_per_steps_; } + int64_t send_down_per_steps() const { return send_down_per_steps_; } + const std::unordered_map& upstream() const { + return upstream_; + } + const std::unordered_map& downstream() const { + return downstream_; + } const std::string& type() const { return type_; } const paddle::framework::ProgramDesc& program() const { return program_; } const std::vector& ops() const { return ops_; } + const std::vector>& unique_ops() const { + return ops_vec_; + } + const std::unordered_map>& + unused_vars() const { + return unused_vars_; + } - bool AddUpstreamTask(int64_t task_id); - bool AddDownstreamTask(int64_t task_id); - std::string DebugString() const; + void SetRunPerSteps(int64_t value); + void SetRunAtOffset(int64_t value); + void SetReplyUpPerSteps(int64_t value); + void SetSendDownPerSteps(int64_t value); + void SetType(const std::string& type) { type_ = type; } + void SetUnusedVars( + const std::unordered_map>& + unused_vars) { + unused_vars_ = unused_vars; + } - static std::unique_ptr CreateEmptyTaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums); - static std::unique_ptr CreateTaskNode( - int32_t role, const std::vector& ops, int64_t rank, - int64_t task_id, int64_t max_run_times, int64_t max_slot_nums); + // upstream need buffs? + bool AddUpstreamTask(int64_t task_id, int64_t buff_size = 1); + bool AddDownstreamTask(int64_t task_id, int64_t buff_size = 1); + std::string DebugString() const; private: DISABLE_COPY_AND_ASSIGN(TaskNode); TaskNode() = default; + // ops_ will be removed in the future std::vector ops_; - std::unordered_set upstream_; - std::unordered_set downstream_; + // task_id-->buff_size + std::unordered_map upstream_; + std::unordered_map downstream_; framework::ProgramDesc program_; + std::vector> ops_vec_; + std::unordered_map> + unused_vars_; + int32_t role_; int64_t rank_; int64_t task_id_; int64_t max_run_times_; int64_t max_slot_nums_; + int64_t run_per_steps_{1}; + int64_t run_at_offset_{0}; + // one input produces multi times output + int64_t reply_up_per_steps_{1}; + // one output need multi times input + int64_t send_down_per_steps_{1}; + std::string type_; }; diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index b0f00d7058476..d4587b90c87f3 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -4,6 +4,12 @@ cc_test(interceptor_ping_pong_test SRCS interceptor_ping_pong_test.cc DEPS fleet set_source_files_properties(compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) + +set_source_files_properties(interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(interceptor_pipeline_long_path_test SRCS interceptor_pipeline_long_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) + set_source_files_properties(compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
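Note on the task_node.h changes above: upstream_ and downstream_ are now task_id -> buff_size maps, and AddUpstreamTask/AddDownstreamTask report through emplace() whether a new edge was actually recorded. The toy, self-contained sketch below is an assumed simplification, not the real TaskNode class; it only demonstrates that return-value behaviour. The default buff_size of 1 matches the new declarations, while the pipeline tests later in this patch pass explicit values such as 2 or 3.

#include <cstdint>
#include <iostream>
#include <unordered_map>

struct ToyTaskNode {  // hypothetical stand-in for TaskNode
  std::unordered_map<int64_t, int64_t> upstream_;  // task_id -> buff_size
  bool AddUpstreamTask(int64_t task_id, int64_t buff_size = 1) {
    // emplace() leaves an existing entry untouched and returns false,
    // so a duplicate edge (even with a different buff_size) is rejected.
    return upstream_.emplace(task_id, buff_size).second;
  }
};

int main() {
  ToyTaskNode node;
  std::cout << node.AddUpstreamTask(7, 3) << "\n";  // prints 1: edge inserted
  std::cout << node.AddUpstreamTask(7, 5) << "\n";  // prints 0: already present
  return 0;
}
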
cc_test(compute_interceptor_run_op_test SRCS compute_interceptor_run_op_test.cc DEPS fleet_executor ${BRPC_DEPS} op_registry fill_constant_op elementwise_add_op scope device_context) diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 2d9776738f831..c5348db83e029 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -61,15 +61,15 @@ TEST(ComputeInterceptor, Compute) { std::vector scopes = {scope, scope}; platform::Place place = platform::CPUPlace(); + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}}, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - Carrier& carrier = Carrier::Instance(); - // FIXME: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = - new TaskNode(0, ops, 0, 0, 2, 2); // role, ops, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 0, 0); + new TaskNode(0, ops, 0, 0, 2, 0); // role, ops, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 2, 0); // a->b node_a->AddDownstreamTask(1); @@ -90,13 +90,6 @@ TEST(ComputeInterceptor, Compute) { msg.set_src_id(-1); msg.set_dst_id(0); carrier.EnqueueInterceptorMessage(msg); - - // stop - InterceptorMessage stop; - stop.set_message_type(STOP); - stop.set_src_id(-1); - stop.set_dst_id(0); - carrier.EnqueueInterceptorMessage(stop); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 3cfd3073c8cb9..44dc0c9bc9b0c 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -35,35 +35,29 @@ class StartInterceptor : public Interceptor { void NOP(const InterceptorMessage& msg) { if (msg.message_type() == STOP) { stop_ = true; + InterceptorMessage stop; + stop.set_message_type(STOP); + Send(1, stop); // stop 1, compute return; } std::cout << GetInterceptorId() << " recv msg from " << msg.src_id() << std::endl; - ++count_; - if (count_ == 3) { - InterceptorMessage stop; - stop.set_message_type(STOP); - Send(msg.dst_id(), stop); // stop 0, this - Send(msg.src_id(), stop); // stop 1, compute - } } - int count_{0}; }; TEST(ComputeInterceptor, Compute) { + Carrier& carrier = Carrier::Instance(); MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}, {2, 0}}, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - Carrier& carrier = Carrier::Instance(); - // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* node_a = new TaskNode(0, 0, 0, 0, 0); // role, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 0, 0); - TaskNode* node_c = new TaskNode(0, 0, 2, 0, 0); + TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); // a->b->c - node_a->AddDownstreamTask(1); - node_b->AddUpstreamTask(0); + node_a->AddDownstreamTask(1, 3); + node_b->AddUpstreamTask(0, 3); node_b->AddDownstreamTask(2); node_c->AddUpstreamTask(1); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc new file 
mode 100644 index 0000000000000..b3fdb0b7adff0 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +void LinkNodes(const std::vector& nodes) { + size_t size = nodes.size(); + if (size <= 1) return; + + { // i = 0 + TaskNode* now = nodes[0]; + TaskNode* next = nodes[1]; + now->AddDownstreamTask(next->task_id()); + } + { // i = size - 1 + TaskNode* prev = nodes[size - 2]; + TaskNode* now = nodes[size - 1]; + now->AddUpstreamTask(prev->task_id()); + } + + for (size_t i = 1; i < size - 1; ++i) { + TaskNode* prev = nodes[i - 1]; + TaskNode* now = nodes[i]; + TaskNode* next = nodes[i + 1]; + + now->AddUpstreamTask(prev->task_id()); + now->AddDownstreamTask(next->task_id()); + } +} + +TEST(AmplifierInterceptor, Amplifier) { + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); + msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}, + {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); + + int64_t micro_steps = 3; + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* node_a = new TaskNode(0, 0, 0, 1, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 1, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 1, 0); + TaskNode* node_d = new TaskNode(0, 0, 3, 1, 0); + TaskNode* node_e = new TaskNode(0, 0, 4, 1, 0); + TaskNode* node_f = new TaskNode(0, 0, 5, 1, 0); + + // a->b->c->d->e->f + LinkNodes({node_a, node_b, node_c, node_d, node_e, node_f}); + + // LR->b(1:3)->F->B->e(3:1)->U + node_b->SetReplyUpPerSteps(micro_steps); + node_e->SetSendDownPerSteps(micro_steps); + + carrier.SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); + carrier.SetInterceptor(1, InterceptorFactory::Create("Amplifier", 1, node_b)); + carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier.SetInterceptor(3, InterceptorFactory::Create("Compute", 3, node_d)); + carrier.SetInterceptor(4, InterceptorFactory::Create("Amplifier", 4, node_e)); + carrier.SetInterceptor(5, InterceptorFactory::Create("Compute", 5, node_f)); + + carrier.SetCreatingFlag(false); + + // start + InterceptorMessage msg; + msg.set_message_type(DATA_IS_READY); + msg.set_src_id(-1); + msg.set_dst_id(0); + carrier.EnqueueInterceptorMessage(msg); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc new file mode 100644 index 
0000000000000..936a970c05f7c --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +int64_t GetBuffSize( + const std::map, int64_t> buffs, + TaskNode* from, TaskNode* to) { + if (buffs.find({from, to}) != buffs.end()) { + return buffs.at({from, to}); + } + if (buffs.find({to, from}) != buffs.end()) { + return buffs.at({to, from}); + } + return 2; // set default 2 +} + +void LinkNodes(const std::vector& nodes, + const std::map, int64_t> buffs) { + size_t size = nodes.size(); + if (size <= 1) return; + + { // i = 0 + TaskNode* now = nodes[0]; + TaskNode* next = nodes[1]; + auto buff_size = GetBuffSize(buffs, now, next); + now->AddDownstreamTask(next->task_id(), buff_size); + } + { // i = size - 1 + TaskNode* prev = nodes[size - 2]; + TaskNode* now = nodes[size - 1]; + auto buff_size = GetBuffSize(buffs, prev, now); + now->AddUpstreamTask(prev->task_id(), buff_size); + } + + for (size_t i = 1; i < size - 1; ++i) { + TaskNode* prev = nodes[i - 1]; + TaskNode* now = nodes[i]; + TaskNode* next = nodes[i + 1]; + + auto buff_size = GetBuffSize(buffs, prev, now); + now->AddUpstreamTask(prev->task_id(), buff_size); + + buff_size = GetBuffSize(buffs, now, next); + now->AddDownstreamTask(next->task_id(), buff_size); + } +} + +TEST(AmplifierInterceptor, Amplifier) { + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); + msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}}, {{0, ""}}, ""); + + int64_t micro_steps = 6; + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* node_a = + new TaskNode(0, 0, 0, micro_steps, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); + TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps, 0); + + // a->b->c->d + // LR->F->B->U + LinkNodes({node_a, node_b, node_c, node_d}, {{{node_b, node_c}, 1}}); + + node_a->SetRunPerSteps(micro_steps); + node_d->SetRunPerSteps(micro_steps); + node_d->SetRunAtOffset(micro_steps - 1); + + carrier.SetInterceptor(0, InterceptorFactory::Create("Amplifier", 0, node_a)); + carrier.SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier.SetInterceptor(3, InterceptorFactory::Create("Amplifier", 3, node_d)); + + carrier.SetCreatingFlag(false); + + // start + InterceptorMessage msg; + msg.set_message_type(DATA_IS_READY); + msg.set_src_id(-1); + msg.set_dst_id(0); + 
carrier.EnqueueInterceptorMessage(msg); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index c5ad4b0099479..a9682d6a6efcc 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -514,6 +514,42 @@ std::future GraphBrpcClient::random_sample_nodes( return fut; } +std::future GraphBrpcClient::load_graph_split_config( + uint32_t table_id, std::string path) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + server_size, [&, server_size = this->server_size ](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < server_size; ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG) != 0) { + ++fail_num; + break; + } + } + ret = fail_num == 0 ? 0 : -1; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + for (size_t i = 0; i < server_size; i++) { + int server_index = i; + closure->request(server_index) + ->set_cmd_id(PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG); + closure->request(server_index)->set_table_id(table_id); + closure->request(server_index)->set_client_id(_client_id); + closure->request(server_index)->add_params(path); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(server_index), + closure->request(server_index), + closure->response(server_index), closure); + } + return fut; +} std::future GraphBrpcClient::use_neighbors_sample_cache( uint32_t table_id, size_t total_size_limit, size_t ttl) { DownpourBrpcClosure *closure = new DownpourBrpcClosure( diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index e3d2ff1d32d72..2e5d5b6ee93cb 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -93,6 +93,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, size_t ttl); + virtual std::future load_graph_split_config(uint32_t table_id, + std::string path); virtual std::future remove_graph_node( uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 094ecbbd402c0..c1348e4804e2b 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -204,6 +204,8 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::sample_neighbors_across_multi_servers; _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] = &GraphBrpcService::use_neighbors_sample_cache; + _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] = + &GraphBrpcService::load_graph_split_config; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -658,5 +660,20 @@ int32_t GraphBrpcService::use_neighbors_sample_cache( ((GraphTable *)table)->make_neighbor_sample_cache(size_limit, ttl); return 0; } + +int32_t GraphBrpcService::load_graph_split_config( + Table *table, const PsRequestMessage &request, 
PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code(response, -1, + "load_graph_split_configrequest requires at least 1 " + "argument1[file_path]"); + return 0; + } + ((GraphTable *)table)->load_graph_split_config(request.params(0)); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index d1a6aa63604f3..ecd78d28ca812 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -126,6 +126,10 @@ class GraphBrpcService : public PsBaseService { PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_graph_split_config(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + private: bool _is_initialize_shard_info; std::mutex _initialize_shard_mutex; diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/service/heter_server.h index 66141622f8cdc..5f062755c9242 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/service/heter_server.h @@ -336,7 +336,7 @@ class HeterServer { bool IsExit() { return service_.IsExit(); } - HeterServer() { this->ready_ = 0; } + HeterServer() : service_(), ready_(0) {} void RegisterServiceHandler(std::string message_name, HeterServiceHandler func); @@ -391,7 +391,7 @@ class HeterServer { DISABLE_COPY_AND_ASSIGN(HeterServer); std::mutex mutex_ready_; - int ready_ = 0; + int ready_; }; } // end namespace distributed diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 8ee9b3590721a..6dfaff1ffa1df 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -58,6 +58,7 @@ enum PsCmdID { PS_GRAPH_SET_NODE_FEAT = 37; PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE = 39; + PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG = 40; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 29941e36ea051..698ceb1578f47 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -51,7 +51,6 @@ void PSCore::init_gflag(const std::string& gflags) { std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); - flags.push_back("-bthread_concurrency=200"); flags.push_back("-socket_max_unwritten_bytes=2048000000"); flags.push_back("-max_connection_pool_size=1950"); } diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index b690d71eab84d..042a4dee62bda 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -56,7 +56,7 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { for (auto &p : batch[i]) { size_t index = p.first % this->shard_num - this->shard_start; - this->shards[index].add_graph_node(p.first)->build_edges(p.second); + this->shards[index]->add_graph_node(p.first)->build_edges(p.second); } return 0; })); @@ -79,7 +79,7 @@ int32_t GraphTable::remove_graph_node(std::vector &id_list) { 
tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { for (auto &p : batch[i]) { size_t index = p % this->shard_num - this->shard_start; - this->shards[index].delete_node(p); + this->shards[index]->delete_node(p); } return 0; })); @@ -97,6 +97,7 @@ void GraphShard::clear() { } GraphShard::~GraphShard() { clear(); } + void GraphShard::delete_node(uint64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; @@ -117,6 +118,14 @@ GraphNode *GraphShard::add_graph_node(uint64_t id) { return (GraphNode *)bucket[node_location[id]]; } +GraphNode *GraphShard::add_graph_node(Node *node) { + auto id = node->get_id(); + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(node); + } + return (GraphNode *)bucket[node_location[id]]; +} FeatureNode *GraphShard::add_feature_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); @@ -134,6 +143,33 @@ Node *GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? nullptr : bucket[iter->second]; } +GraphTable::~GraphTable() { + for (auto p : shards) { + delete p; + } + for (auto p : extra_shards) { + delete p; + } + shards.clear(); + extra_shards.clear(); +} + +int32_t GraphTable::load_graph_split_config(const std::string &path) { + VLOG(4) << "in server side load graph split config\n"; + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + size_t index = (size_t)std::stoi(values[0]); + if (index != _shard_idx) continue; + auto dst_id = std::stoull(values[1]); + extra_nodes.insert(dst_id); + } + if (extra_nodes.size() != 0) use_duplicate_nodes = true; + return 0; +} + int32_t GraphTable::load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); bool load_node = (param[0] == 'n'); @@ -154,7 +190,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( res.clear(); std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { - end = total_size + shards[i].get_size(); + end = total_size + shards[i]->get_size(); start = total_size; while (start < end && index < ranges.size()) { if (ranges[index].second <= start) @@ -169,11 +205,11 @@ int32_t GraphTable::get_nodes_ids_by_ranges( second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, first, second, i]() -> std::vector { - return shards[i].get_ids_by_range(first, second); + return shards[i]->get_ids_by_range(first, second); })); } } - total_size += shards[i].get_size(); + total_size += shards[i]->get_size(); } for (size_t i = 0; i < tasks.size(); i++) { auto vec = tasks[i].get(); @@ -217,7 +253,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { size_t index = shard_id - shard_start; - auto node = shards[index].add_feature_node(id); + auto node = shards[index]->add_feature_node(id); node->set_feature_size(feat_name.size()); @@ -245,7 +281,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; - + int extra_alloc_index = 0; for (auto path : paths) { std::ifstream file(path); std::string line; @@ -268,8 +304,24 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { size_t src_shard_id = src_id % shard_num; if (src_shard_id >= shard_end || src_shard_id < 
shard_start) { - VLOG(4) << "will not load " << src_id << " from " << path - << ", please check id distribution"; + if (use_duplicate_nodes == false || + extra_nodes.find(src_id) == extra_nodes.end()) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + int index; + if (extra_nodes_to_thread_index.find(src_id) != + extra_nodes_to_thread_index.end()) { + index = extra_nodes_to_thread_index[src_id]; + } else { + index = extra_alloc_index++; + extra_alloc_index %= task_pool_size_; + extra_nodes_to_thread_index[src_id] = index; + } + extra_shards[index]->add_graph_node(src_id)->build_edges(is_weighted); + extra_shards[index]->add_neighbor(src_id, dst_id, weight); + valid_count++; continue; } if (count % 1000000 == 0) { @@ -278,36 +330,130 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { } size_t index = src_shard_id - shard_start; - shards[index].add_graph_node(src_id)->build_edges(is_weighted); - shards[index].add_neighbor(src_id, dst_id, weight); + shards[index]->add_graph_node(src_id)->build_edges(is_weighted); + shards[index]->add_neighbor(src_id, dst_id, weight); valid_count++; } } VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " << path; + std::vector used(task_pool_size_, 0); // Build Sampler j for (auto &shard : shards) { - auto bucket = shard.get_bucket(); + auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); + used[get_thread_pool_index(bucket[i]->get_id())]++; } } + /*----------------------- + relocate the duplicate nodes to make them distributed evenly among threads. +*/ + for (auto &shard : extra_shards) { + auto bucket = shard->get_bucket(); + for (size_t i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + int size = extra_nodes_to_thread_index.size(); + if (size == 0) return 0; + std::vector index; + for (int i = 0; i < used.size(); i++) index.push_back(i); + sort(index.begin(), index.end(), + [&](int &a, int &b) { return used[a] < used[b]; }); + + std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); + int t = 1, aim = 0, mod = 0; + for (; t < used.size(); t++) { + if ((used[index[t]] - used[index[t - 1]]) * t >= size) { + break; + } else { + size -= (used[index[t]] - used[index[t - 1]]) * t; + } + } + aim = used[index[t - 1]] + size / t; + mod = size % t; + for (int x = t - 1; x >= 0; x--) { + alloc[index[x]] = aim; + if (t - x <= mod) alloc[index[x]]++; + alloc[index[x]] -= used[index[x]]; + } + std::vector vec[index.size()]; + for (auto p : extra_nodes_to_thread_index) { + has_alloc[p.second]++; + vec[p.second].push_back(p.first); + } + sort(index.begin(), index.end(), [&](int &a, int &b) { + return has_alloc[a] - alloc[a] < has_alloc[b] - alloc[b]; + }); + int left = 0, right = index.size() - 1; + while (left < right) { + if (has_alloc[index[right]] - alloc[index[right]] == 0) break; + int x = std::min(alloc[index[left]] - has_alloc[index[left]], + has_alloc[index[right]] - alloc[index[right]]); + has_alloc[index[left]] += x; + has_alloc[index[right]] -= x; + uint64_t id; + while (x--) { + id = vec[index[right]].back(); + vec[index[right]].pop_back(); + extra_nodes_to_thread_index[id] = index[left]; + vec[index[left]].push_back(id); + } + if (has_alloc[index[right]] - alloc[index[right]] == 0) right--; + if (alloc[index[left]] - has_alloc[index[left]] == 0) left++; + } + std::vector extra_shards_copy; + for (int i = 0; i < task_pool_size_; 
++i) { + extra_shards_copy.push_back(new GraphShard()); + } + for (auto &shard : extra_shards) { + auto &bucket = shard->get_bucket(); + auto &node_location = shard->get_node_location(); + while (bucket.size()) { + Node *temp = bucket.back(); + bucket.pop_back(); + node_location.erase(temp->get_id()); + extra_shards_copy[extra_nodes_to_thread_index[temp->get_id()]] + ->add_graph_node(temp); + } + } + for (int i = 0; i < task_pool_size_; ++i) { + delete extra_shards[i]; + extra_shards[i] = extra_shards_copy[i]; + } return 0; } Node *GraphTable::find_node(uint64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { - return nullptr; + if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) + return nullptr; + auto iter = extra_nodes_to_thread_index.find(id); + if (iter == extra_nodes_to_thread_index.end()) + return nullptr; + else { + return extra_shards[iter->second]->find_node(id); + } } size_t index = shard_id - shard_start; - Node *node = shards[index].find_node(id); + Node *node = shards[index]->find_node(id); return node; } uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { - return node_id % shard_num % shard_num_per_server % task_pool_size_; + if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) + return node_id % shard_num % shard_num_per_server % task_pool_size_; + size_t src_shard_id = node_id % shard_num; + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + auto iter = extra_nodes_to_thread_index.find(node_id); + if (iter != extra_nodes_to_thread_index.end()) { + return iter->second; + } + } + return src_shard_id % shard_num_per_server % task_pool_size_; } uint32_t GraphTable::get_thread_pool_index_by_shard_index( @@ -319,11 +465,16 @@ int32_t GraphTable::clear_nodes() { std::vector> tasks; for (size_t i = 0; i < shards.size(); i++) { tasks.push_back( - _shards_task_pool[get_thread_pool_index_by_shard_index(i)]->enqueue( - [this, i]() -> int { - this->shards[i].clear(); - return 0; - })); + _shards_task_pool[i % task_pool_size_]->enqueue([this, i]() -> int { + this->shards[i]->clear(); + return 0; + })); + } + for (size_t i = 0; i < extra_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i]->enqueue([this, i]() -> int { + this->extra_shards[i]->clear(); + return 0; + })); } for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); return 0; @@ -334,7 +485,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, int &actual_size) { int total_size = 0; for (int i = 0; i < shards.size(); i++) { - total_size += shards[i].get_size(); + total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; int range_num = random_sample_nodes_ranges; @@ -401,8 +552,8 @@ int32_t GraphTable::random_sample_neighbors( size_t node_num = buffers.size(); std::function char_del = [](char *c) { delete[] c; }; std::vector> tasks; - std::vector> seq_id(shard_end - shard_start); - std::vector> id_list(shard_end - shard_start); + std::vector> seq_id(task_pool_size_); + std::vector> id_list(task_pool_size_); size_t index; for (size_t idx = 0; idx < node_num; ++idx) { index = get_thread_pool_index(node_ids[idx]); @@ -524,7 +675,7 @@ int32_t GraphTable::set_node_feat( tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; - auto node = shards[index].add_feature_node(node_id); + auto node = shards[index]->add_feature_node(node_id); 
node->set_feature_size(this->feat_name.size()); for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; @@ -581,7 +732,7 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int size = 0, cur_size; std::vector>> tasks; for (size_t i = 0; i < shards.size() && total_size > 0; i++) { - cur_size = shards[i].get_size(); + cur_size = shards[i]->get_size(); if (size + cur_size <= start) { size += cur_size; continue; @@ -590,7 +741,7 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int end = start + (count - 1) * step + 1; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, i, start, end, step, size]() -> std::vector { - return this->shards[i].get_batch(start - size, end - size, step); + return this->shards[i]->get_batch(start - size, end - size, step); })); start += count * step; total_size -= count; @@ -665,7 +816,14 @@ int32_t GraphTable::initialize() { shard_end = shard_start + shard_num_per_server; VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; - shards = std::vector(shard_num_per_server, GraphShard(shard_num)); + for (int i = 0; i < shard_num_per_server; i++) { + shards.push_back(new GraphShard()); + } + use_duplicate_nodes = false; + for (int i = 0; i < task_pool_size_; i++) { + extra_shards.push_back(new GraphShard()); + } + return 0; } } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 9ca59db3bb268..b76ab0ae95060 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -47,7 +47,6 @@ class GraphShard { public: size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { this->shard_num = shard_num; } ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); @@ -60,18 +59,18 @@ class GraphShard { } GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(Node *node); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); void delete_node(uint64_t id); void clear(); void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map get_node_location() { + std::unordered_map &get_node_location() { return node_location; } private: std::unordered_map node_location; - int shard_num; std::vector bucket; }; @@ -355,7 +354,7 @@ class ScaledLRU { class GraphTable : public SparseTable { public: GraphTable() { use_cache = false; } - virtual ~GraphTable() {} + virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr &buffer, int &actual_size, bool need_feature, @@ -374,6 +373,7 @@ class GraphTable : public SparseTable { virtual int32_t initialize(); int32_t load(const std::string &path, const std::string ¶m); + int32_t load_graph_split_config(const std::string &path); int32_t load_edges(const std::string &path, bool reverse); @@ -434,7 +434,7 @@ class GraphTable : public SparseTable { } protected: - std::vector shards; + std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; @@ -449,7 +449,9 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - bool use_cache; + std::unordered_set 
extra_nodes; + std::unordered_map extra_nodes_to_thread_index; + bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc index e2311cc307b60..52c708be88488 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -65,6 +65,9 @@ void GraphNode::build_edges(bool is_weighted) { } } void GraphNode::build_sampler(std::string sample_type) { + if (sampler != nullptr) { + return; + } if (sample_type == "random") { sampler = new RandomSampler(); } else if (sample_type == "weighted") { diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 597a08973b957..62de82832e133 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -21,6 +21,9 @@ cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_funct set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc new file mode 100644 index 0000000000000..3fcddde787f69 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -0,0 +1,275 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +std::vector graph_split = {std::string("0\t97")}; +char graph_split_file_name[] = "graph_split.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + 
server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + pserver_ptr_->build_peer2peer_connection(0); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector 
empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); + pserver_ptr2->build_peer2peer_connection(1); +} + +void RunClient( + std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunGraphSplit() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, edges); + prepare_file(node_file_name, nodes); + prepare_file(graph_split_file_name, graph_split); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + + std::thread* server_thread2 = new std::thread(RunServer2); + + sleep(2); + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + + auto pull_status = worker_ptr_->load_graph_split_config( + 0, std::string(graph_split_file_name)); + pull_status.wait(); + pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector> _vs; + std::vector> vs; + pull_status = worker_ptr_->batch_sample_neighbors( + 0, std::vector(1, 10240001024), 4, _vs, vs, true); + pull_status.wait(); + ASSERT_EQ(0, _vs[0].size()); + _vs.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighbors( + 0, std::vector(1, 97), 4, _vs, vs, true); + pull_status.wait(); + ASSERT_EQ(3, _vs[0].size()); + std::remove(edge_file_name); + std::remove(node_file_name); + std::remove(graph_split_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); +} + +TEST(RunGraphSplit, Run) { RunGraphSplit(); } \ No newline at end of file diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index e8cb55b7afeb9..d5abf639c83db 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -2,7 +2,7 @@ set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc 
index 7345c3612381b..9d475d96e56ce 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -193,13 +193,14 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { // TODO(jiabin): Support NPU here PADDLE_TENSOR_ADD(float); - // NOTE(phlrain): xpu only support float +// NOTE(phlrain): xpu only support float +#ifndef PADDLE_WITH_XPU PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future PADDLE_TENSOR_ADD(paddle::platform::complex); PADDLE_TENSOR_ADD(paddle::platform::complex); - +#endif #undef PADDLE_TENSOR_ADD if (data_type == paddle::framework::proto::VarType::FP16) { @@ -268,13 +269,14 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { // TODO(jiabin): Support NPU here PADDLE_TENSOR_ADD(float); - // NOTE(phlrain): xpu only support float +// NOTE(phlrain): xpu only support float +#ifndef PADDLE_WITH_XPU PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future PADDLE_TENSOR_ADD(paddle::platform::complex); PADDLE_TENSOR_ADD(paddle::platform::complex); - +#endif #undef PADDLE_TENSOR_ADD if (data_type == paddle::framework::proto::VarType::FP16) { diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index 407a8d69e52da..ebbef286f7923 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) add_subdirectory(fluid_generated) endif() diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 7f85d014fa842..85ff6687e0dbe 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -20,6 +20,7 @@ #include "paddle/pten/core/dense_tensor.h" namespace egr { +namespace egr_utils_api { void RegisterGradientHookForTensor( const egr::EagerTensor& tensor, @@ -90,4 +91,5 @@ void RetainGradForTensor(const egr::EagerTensor& tensor) { } } +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index bf320f0b15d4a..7e4faa5a2c701 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -18,6 +18,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/pten/api/all.h" namespace egr { +namespace egr_utils_api { void RegisterGradientHookForTensor( const egr::EagerTensor& tensor, @@ -27,4 +28,5 @@ void RegisterReduceHookForTensor(const egr::EagerTensor& tensor, const std::function& hook); void RetainGradForTensor(const egr::EagerTensor& tensor); +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 9dbb308a2c906..ad6c34b7cf86c 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -26,6 +26,7 @@ #include "paddle/fluid/framework/variable.h" namespace egr { +namespace egr_utils_api { bool IsLeafTensor(const egr::EagerTensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); @@ -58,4 +59,5 @@ egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, return out; } +} // namespace egr_utils_api } // 
namespace egr diff --git a/paddle/fluid/eager/api/utils/tensor_utils.h b/paddle/fluid/eager/api/utils/tensor_utils.h index a0d8caf3cb307..b3c4b59682320 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.h +++ b/paddle/fluid/eager/api/utils/tensor_utils.h @@ -18,6 +18,7 @@ #include "paddle/pten/api/all.h" namespace egr { +namespace egr_utils_api { // If and only if the tensor holds an AccumulationNode // Then it's treated as a leaf tensor @@ -29,4 +30,5 @@ egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, const pten::DataLayout& layout, float value, bool is_leaf = true); +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 5d31c9139baa8..187c3db445222 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -17,13 +17,42 @@ execute_process( ) if(WIN32) + set(EAGER_CODEGEN_DEPS eager_generator) + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}") + else() + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") + endif() + + if(${CBLAS_PROVIDER} STREQUAL MKLML) + message("Copied libiomp5md.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path} + DEPENDS mklml) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll) + else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + message("Copied openblas.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path} + DEPENDS extern_openblas) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll) + endif() + + if(WITH_MKLDNN) + message("Copied mkldnn.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path} + DEPENDS mkldnn) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll) + endif() + add_custom_target(eager_codegen - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - DEPENDS eager_generator + COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/op_list.txt" + DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() add_custom_target(eager_codegen - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/op_list.txt" DEPENDS eager_generator VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index c0714775da852..fe29792b6e75c 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -22,33 +22,28 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/variable.h" +#include 
"paddle/fluid/pybind/op_function_generator.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace framework { + +static std::unordered_map + operators_with_attrs = {}; + static std::unordered_set operators_to_skip = { - "fused_elemwise_add_activation", // No Default Attr - "fused_elemwise_activation", // No Default Attr - "reverse", // Attr Error - "flip", // Attr Error - "cast", // Attr Error - "sum", - "minus", // Multiple ops_ - "pull_sparse", - "pull_box_extended_sparse", - "pull_sparse_v2", - "pull_box_sparse", - "fused_attention", - "diag_v2", + "minus", }; -static std::unordered_set operators_to_codegen = { - "sigmoid", "matmul_v2", "reduce_sum", "elementwise_add", - "share_buffer", "var_conv_2d", "split"}; - +static std::unordered_set operators_to_codegen = {}; static std::unordered_set skipped_operators = {}; -namespace paddle { -namespace framework { +static std::string LegalizeVariableName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' + return ret; +} static std::string AttrTypeToString(const proto::AttrType& type) { std::string ret; @@ -358,15 +353,81 @@ static bool CheckOpProto(proto::OpProto* op_proto) { return true; } -/* -------------------------------- */ -/* --------- Collect Info --------- */ -/* -------------------------------- */ -static bool CollectInformationFromOpInfo( - const paddle::framework::OpInfo& op_info, - std::vector* grad_node_default_attr_maps, - std::vector* grad_op_types, +/* --------------------------------------- */ +/* --------- Preprocess Ins/Outs --------- */ +/* --------------------------------------- */ +static void PurifyForwardOpProto( + const proto::OpProto& op_proto, std::unordered_map* fwd_inputs_name_pos_map, std::unordered_map* fwd_outputs_name_pos_map, + std::vector* in_vars, + std::vector* out_vars) { + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; + + // in_vars + auto iter = in_vars->begin(); + for (iter = in_vars->begin(); iter != in_vars->end(); iter++) { + if (iter->name() == input_name) { + break; + } + } + in_vars->erase(iter); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // out_vars + auto iter = out_vars->begin(); + for (iter = out_vars->begin(); iter != out_vars->end(); iter++) { + if (iter->name() == output_name) { + break; + } + } + out_vars->erase(iter); + } + } + } + + /* ------ Maping forward slot name to fwd position ------ */ + size_t in_pos = 0; + for (const auto& var : *in_vars) { + VLOG(6) << "Mapping input tensor: " << var.name() + << " To position: " << in_pos; + (*fwd_inputs_name_pos_map)[var.name()] = in_pos; + in_pos++; + } + + size_t out_pos = 0; + for (const auto& var : *out_vars) { + VLOG(6) << "Mapping output tensor: " << var.name() + << " To position: " << out_pos; + 
(*fwd_outputs_name_pos_map)[var.name()] = out_pos; + out_pos++; + } +} + +static void PurifyGradOpProto( + const proto::OpProto& op_proto, std::map* grad_outs_slotname_map, std::map* grad_ins_fwd_slotname_map, std::map* grad_ins_grad_slotname_map, @@ -376,6 +437,114 @@ static bool CollectInformationFromOpInfo( std::map>>* grad_outs) { + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; + + // grad_outs_slotname_map + auto grad_outs_slotname_map_purified = *grad_outs_slotname_map; + for (const auto& iter : *grad_outs_slotname_map) { + const std::string& grad_output_name = iter.first; + const std::string& matched_input_name = iter.second; + if (matched_input_name == input_name) { + grad_outs_slotname_map_purified.erase(grad_output_name); + + PADDLE_ENFORCE( + grad_outs->count(grad_output_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient output name in grad_outs.")); + // grad_outs + grad_outs->erase(grad_output_name); + } + } + *grad_outs_slotname_map = grad_outs_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(input_name)) + grad_ins_fwd_slotname_map->erase(input_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(input_name)) grad_ins->erase(input_name); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // grad_ins_grad_slotname_map + auto grad_ins_grad_slotname_map_purified = *grad_ins_grad_slotname_map; + for (const auto& iter : *grad_ins_grad_slotname_map) { + const std::string& grad_input_name = iter.first; + const std::string& matched_output_name = iter.second; + if (matched_output_name == output_name) { + grad_ins_grad_slotname_map_purified.erase(grad_input_name); + + PADDLE_ENFORCE( + grad_ins->count(grad_input_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient input name in grad_ins.")); + // grad_ins + grad_ins->erase(grad_input_name); + } + } + *grad_ins_grad_slotname_map = grad_ins_grad_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(output_name)) + grad_ins_fwd_slotname_map->erase(output_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(output_name)) grad_ins->erase(output_name); + } + } + } +} + +/* -------------------------------- */ +/* --------- Collect Info --------- */ +/* -------------------------------- */ +static void CollectForwardInformationFromOpInfo( + const paddle::framework::OpInfo& op_info, + std::vector* in_vars, + std::vector* out_vars) { + const proto::OpProto& op_proto = *op_info.proto_; + for (const proto::OpProto::Var& input : op_proto.inputs()) { + in_vars->push_back(input); + } + for (const proto::OpProto::Var& output : op_proto.outputs()) { + out_vars->push_back(output); + } +} + +static bool CollectGradInformationFromOpInfo( + const 
paddle::framework::OpInfo& op_info, bool* generate_forward_only, + std::vector* grad_op_types, // grad + std::map* grad_outs_slotname_map, // grad + std::map* grad_ins_fwd_slotname_map, // grad + std::map* grad_ins_grad_slotname_map, // grad + std::map>>* + grad_ins, // grad + std::map>>* + grad_outs // grad + ) { const proto::OpProto& op_proto = *op_info.proto_; const std::string& op_type = op_proto.type(); std::vector dims = {1, 1, 1, 1}; @@ -429,13 +598,23 @@ static bool CollectInformationFromOpInfo( paddle::framework::AttributeMap default_attrs; auto* attr_checker = op_info.Checker(); if (attr_checker) { + VLOG(6) << "Checking AttributeMap Settings"; attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); default_attrs = attr_checker->GetDefaultAttrMap(); + VLOG(6) << "AttributeMap Checking Passed"; } else { VLOG(6) << "Detected Null Attribute Checker, use empty default_attrs"; } + if (operators_with_attrs.count(op_type)) { + VLOG(6) << "Found operator " << op_type << " using special AttributeMap"; + attrs = operators_with_attrs[op_type]; + } + VLOG(6) << "Prepared Default Attributes Map, size = " << default_attrs.size(); + for (const auto& iter : default_attrs) { + VLOG(6) << iter.first; + } /* ---------------------------- */ /* --------- Backward --------- */ @@ -465,8 +644,8 @@ static bool CollectInformationFromOpInfo( /* ------ Run GradOpMaker ------ */ if (!op_info.dygraph_grad_op_maker_) { - VLOG(6) << op_type << " has no GradOpMaker, skip it"; - skipped_operators.insert(op_type); + VLOG(6) << op_type << " has no GradOpMaker"; + *generate_forward_only = true; return false; } @@ -476,17 +655,19 @@ static bool CollectInformationFromOpInfo( if (!grad_node) { VLOG(6) << "Got nullptr GradOpNode for " << op_type - << " likely registered EmptyGradOpMaker, skip it"; - skipped_operators.insert(op_type); + << " likely registered EmptyGradOpMaker"; + *generate_forward_only = true; return false; } + /* if (grad_node->size() > 1) { // Backward attributes can be super complicated VLOG(6) << "Skip GradOpNode with multiple OpBases for now: " << op_type; skipped_operators.insert(op_type); return false; } + */ VLOG(6) << "Prepared GradOpNode"; @@ -494,7 +675,6 @@ static bool CollectInformationFromOpInfo( for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { // Each OpBase paddle::imperative::OpBase& op_base = *iter; - grad_node_default_attr_maps->push_back(op_base.DefaultAttrsMap()); grad_op_types->push_back(op_base.Type()); } @@ -538,22 +718,6 @@ static bool CollectInformationFromOpInfo( grad_outs_slotname_map); VLOG(6) << "Finished Slotname Matching for Grad_Outs"; - /* ------ Maping forward slot name to fwd position ------ */ - size_t in_pos = 0; - for (const auto& iter : ins) { - VLOG(6) << "Mapping input tensor: " << iter.first - << " To position: " << in_pos; - (*fwd_inputs_name_pos_map)[iter.first] = in_pos; - in_pos++; - } - size_t out_pos = 0; - for (const auto& iter : outs) { - VLOG(6) << "Mapping output tensor: " << iter.first - << " To position: " << out_pos; - (*fwd_outputs_name_pos_map)[iter.first] = out_pos; - out_pos++; - } - return true; } @@ -561,16 +725,13 @@ static bool CollectInformationFromOpInfo( /* --------- CodeGen: Forward GradNode Creation ------ */ /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( - const std::vector& - grad_node_default_attr_maps, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, const std::map& 
grad_ins_fwd_slotname_map, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating GradNode Creation codes"; - const std::string& op_type = op_proto.type(); - // [Generation] Construct GradOpNode // Run ComputeRequiredGrad @@ -578,7 +739,7 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -602,7 +763,7 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; @@ -636,8 +797,8 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] Generation std::string grad_node_creation_str = ""; - size_t bwd_in_slot_num = op_proto.outputs().size(); - size_t bwd_out_slot_num = op_proto.inputs().size(); + size_t bwd_in_slot_num = out_vars.size(); + size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; @@ -669,7 +830,7 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] SetGradOutMeta // [GradOpNode] Add Edges std::string compute_require_grad_args = "trace_backward"; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; compute_require_grad_args += ", &" + input_autograd_name; @@ -689,7 +850,7 @@ static std::string GenerateGradNodeCreationContent( // [AutogradMeta] SetOutRank // [AutogradMeta] SetHistory std::string pass_stop_gradient_args = "false"; - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; pass_stop_gradient_args += ", &" + output_autograd_name; @@ -727,24 +888,11 @@ static std::string GenerateGradNodeCreationContent( return grad_node_creation_body_str; } -static std::string AppendUseOp(const std::string& op_type) { - // [Generation] Append USE_OP - const char* USE_OP_TEMPLATE = "USE_OP(%s);\n"; - std::string return_str = paddle::string::Sprintf(USE_OP_TEMPLATE, op_type); - - // Special Ops - if (op_type == "reduce_sum") - return_str += paddle::string::Sprintf(USE_OP_TEMPLATE, "reduce_sum_grad"); - - return return_str; -} - /* -------------------------------- */ /* --------- CodeGen: Forward ----- */ /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( - const std::vector& - grad_node_default_attr_maps, + bool generate_forward_only, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& 
fwd_outputs_name_pos_map, const std::map& grad_ins_fwd_slotname_map, @@ -758,7 +906,8 @@ static std::pair GenerateForwardFunctionContents( std::string, std::vector>>& grad_outs, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { /* // Forward Function Example: std::tuple, Tensor, vector> @@ -779,6 +928,7 @@ static std::pair GenerateForwardFunctionContents( ,ConstructDuplicableOutput(Out1Num)} }; // According to op_proto->attrs() + egr::legacy::RunOp("op_type", ins, outs, attr_map, Controller.Instance().GetExpectedPlace(), {}); @@ -795,8 +945,6 @@ static std::pair GenerateForwardFunctionContents( */ VLOG(6) << "Generating Dygraph Forward Function"; - const std::string& op_type = op_proto.type(); - std::string generated_function_body = ""; std::string dygraph_function_args_str = ""; @@ -806,8 +954,8 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Ins Map std::string ins_contents_str = ""; - std::vector input_args_str_list(op_proto.inputs().size()); - for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::vector input_args_str_list(in_vars.size()); + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); size_t input_position = fwd_inputs_name_pos_map.at(input_name); if (input.duplicable()) { @@ -848,7 +996,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; if (output.duplicable()) { @@ -859,7 +1007,7 @@ static std::pair GenerateForwardFunctionContents( paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum); dygraph_function_args_str += arg_str; const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::ConstructDuplicableOutput(%s) },"; + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput(%s) },"; outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum); } else { @@ -888,7 +1036,6 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Attrs dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; - generated_function_body += "\n"; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = @@ -898,54 +1045,57 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetExpectedPlace(),\n" " &default_attrs, true, {});\n"; std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_proto.type()); + paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); generated_function_body += trace_op_str; generated_function_body += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; // [Generation] Convert output VarBase to Vector/Tensor - size_t output_size = op_proto.outputs().size(); + size_t output_size = out_vars.size(); std::vector return_contents(output_size); std::vector return_types(output_size); - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string out_tensor_str; size_t return_position = fwd_outputs_name_pos_map.at(output_name); + std::string output_varname = LegalizeVariableName(output_name); if (output.duplicable()) { const char* FWD_OUT_TENSORS_TEMPLATE = " std::vector %s = " "egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; 
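+      // Illustrative expansion of the template above (assuming a duplicable
+      // forward output named "Out"), roughly:
+      //   std::vector<egr::EagerTensor> Out =
+      //       egr::EagerUtils::GetOutputs(outs["Out"]);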
out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, - output_name, output_name); + output_varname, output_name); return_types[return_position] = "std::vector"; } else { const char* FWD_OUT_TENSOR_TEMPLATE = " egr::EagerTensor %s = " "egr::EagerUtils::GetOutput(outs[\"%s\"][0]);\n"; out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, - output_name, output_name); + output_varname, output_name); return_types[return_position] = "egr::EagerTensor"; } - return_contents[return_position] = output_name; + return_contents[return_position] = output_varname; generated_function_body += out_tensor_str; } generated_function_body += "\n"; VLOG(6) << "Converted Output VarBase to EagerTensor(s)"; // [Generation] ComputeRequireGrad -> GradNodeCreation - std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - grad_node_default_attr_maps, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, op_proto); - generated_function_body += grad_node_creation_body_str; - generated_function_body += "\n"; - VLOG(6) << "Generated GradNode Creation codes"; + if (!generate_forward_only) { + std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, + grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); + generated_function_body += grad_node_creation_body_str; + generated_function_body += "\n"; + VLOG(6) << "Generated GradNode Creation codes"; + } // [Generation] Handle return: Tuple/Vector/Tensor generated_function_body += "\n"; - std::string return_str; + std::string return_str = ""; std::string return_type_str = ""; std::string function_proto_return_type_str = ""; if (return_contents.size() > 1) { @@ -968,14 +1118,20 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_FUNCTION_PROTO_RETURN_TEMPLATE = "std::tuple<%s>"; function_proto_return_type_str = paddle::string::Sprintf( FWD_FUNCTION_PROTO_RETURN_TEMPLATE, return_type_str); - } else { + + } else if (return_contents.size() == 1) { // Return vector or Tensor return_type_str = return_types[0]; const char* FWD_TENSOR_RETURN_TEMPLATE = " return %s;"; return_str = paddle::string::Sprintf(FWD_TENSOR_RETURN_TEMPLATE, return_contents[0]); function_proto_return_type_str = return_type_str; + + } else { + return_str = "return nullptr;"; + function_proto_return_type_str = "void*"; } + generated_function_body += return_str; generated_function_body += "\n"; VLOG(6) << "Generated return codes"; @@ -983,14 +1139,16 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Full Function std::string function_name = op_type + "_dygraph_function"; + if (dygraph_function_args_str.size() > 0) { + auto iter = dygraph_function_args_str.begin(); + if ((*iter) == ',') dygraph_function_args_str.erase(iter); + } + const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, dygraph_function_args_str, generated_function_body); - // [Generation] Append USE_OP - fwd_function_str += AppendUseOp(op_type); - // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; std::string dygraph_function_declaration_str = paddle::string::Sprintf( @@ -1004,8 +1162,6 @@ static std::pair GenerateForwardFunctionContents( /* --------- CodeGen: GradNode::operator() ------ */ /* ---------------------------------------------- */ static std::string GenerateGradNodeCCContents( - 
const std::vector& - grad_node_default_attr_maps, const std::vector& grad_op_types, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, @@ -1020,7 +1176,8 @@ static std::string GenerateGradNodeCCContents( std::string, std::vector>>& grad_outs, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating Grad Node CC"; /* [Outline] @@ -1066,7 +1223,6 @@ static std::string GenerateGradNodeCCContents( } */ - const std::string& op_type = op_proto.type(); std::string generated_grad_function_body = ""; // [Generation] Get Tracer @@ -1122,7 +1278,7 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Outs Map std::unordered_set duplicable_input_name_set; - for (const auto& in : op_proto.inputs()) { + for (const auto& in : in_vars) { if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } @@ -1132,23 +1288,76 @@ static std::string GenerateGradNodeCCContents( if (grad_outs_slotname_map.count(grad_output_name)) { // Fwd Tensor - const std::string& fwd_input_name = - grad_outs_slotname_map.at(grad_output_name); - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_input_name); - - if (duplicable_input_name_set.count(fwd_input_name)) { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::ConstructDuplicableOutput( " - "this->OutputMeta()[%d].Size() ) },"; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_output_name); + + /* Handle Special Case: "PullSparseOp", etc + + Forward: + + Ids W + | | + PullSparseOp + | + Out + + Backward: + + Ids GradOut W + | | | + PullSparseGradOp + | + GradOut + + Its grad output "GradOut" corresponds to forward output "Out", + where there is a hiden inplace involved. So we find "GradOut"'s index + in + grads, and perform the inplace operation by constructing outs = + {{"Out", grads[i]}} + + GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] + outs = {{"Out", grads[i]}} + + For returns, append "GradOut" to the very end of return list. + */ + if (!fwd_inputs_name_pos_map.count(fwd_name)) { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + + size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); + std::string grad_ptr_name = fwd_name + "_ptrs"; + const char* GET_GRADS_PTR_TEMPLATE = + " std::vector> %s;\n" + " for(const auto& t : grads[%d]) {\n " + "%s.emplace_back(std::move(std::make_shared(t)));" + "\n }\n"; + std::string grads_ptr_str = + paddle::string::Sprintf(GET_GRADS_PTR_TEMPLATE, grad_ptr_name, + grads_position, grad_ptr_name); + generated_grad_function_body += grads_ptr_str; + generated_grad_function_body += "\n"; + + const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },"; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grad_ptr_name); + } else { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance()." 
- "GenerateUniqueName())}},"; - outs_contents_str += paddle::string::Sprintf(GRAD_OUTS_CONTENT_TEMPLATE, - grad_output_name); + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (duplicable_input_name_set.count(fwd_name)) { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " + "this->OutputMeta()[%d].Size() ) },"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + } else { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", " + "{std::make_shared(egr::Controller::Instance()." + "GenerateUniqueName())}},"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + } } } else { PADDLE_THROW(platform::errors::Fatal( @@ -1173,7 +1382,7 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Attrs Map std::string trace_opbase_str = ""; - for (size_t i = 0; i < grad_node_default_attr_maps.size(); i++) { + for (size_t i = 0; i < grad_op_types.size(); i++) { const std::string& op_base_type = grad_op_types[i]; const char* TRACE_OP_TEMPLATE = @@ -1192,15 +1401,39 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Return std::string outputs_str = ""; + size_t num_appended_outputs = 0; for (auto iter : grad_outs) { const std::string& grad_out_name = iter.first; - size_t fwd_input_position = - fwd_inputs_name_pos_map.at(grad_outs_slotname_map.at(grad_out_name)); + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_inputs_name_pos_map.count(fwd_name)) { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, + fwd_input_position, grad_out_name); + num_appended_outputs++; + } else { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + } + } - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, - fwd_input_position, grad_out_name); + /* Handle Special Case: "PullSparseOp", etc + For returns, append "GradOut" to the very end of return list. 
*/ + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_outputs_name_pos_map.count(fwd_name)) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, num_appended_outputs, grad_out_name); + num_appended_outputs++; + } } const char* BWD_RETURN_TEMPLATE = @@ -1230,10 +1463,9 @@ static std::string GenerateGradNodeCCContents( /* --------- CodeGen: GradNode Header ------ */ /* ----------------------------------------- */ static std::string GenerateGradNodeHeaderContents( - const std::vector& - grad_node_default_attr_maps, const std::map& grad_ins_fwd_slotname_map, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating Grad Node Header"; const char* GRAD_NODE_TEMPLATE = @@ -1261,8 +1493,6 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" "};"; - const std::string& op_type = op_proto.type(); - // [Generation] Handle Attributes std::string set_attr_map_str = " void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {\n " @@ -1279,12 +1509,12 @@ static std::string GenerateGradNodeHeaderContents( // [Generation] Handle TensorWrappers std::unordered_set duplicable_tensors; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { if (input.duplicable()) { duplicable_tensors.insert(input.name()); } } - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { if (output.duplicable()) { duplicable_tensors.insert(output.name()); } @@ -1363,34 +1593,31 @@ static void GenerateForwardHFile(const std::string& output_dir, forward_header_stream.close(); } -static void GenerateForwardDygraphFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateForwardDygraphFile(const std::string& output_dir, const std::string& fwd_function_str) { std::string forwards_dir = output_dir + "/forwards/"; - std::string node_h_filename = op_type + "_node.h"; - std::string forward_cc_filename = op_type + "_dygraph.cc"; + std::string forward_cc_filename = "dygraph_forward_functions.cc"; std::string forward_cc_path = forwards_dir + forward_cc_filename; const char* FORWARD_INCLUDE_TEMPLATE = "#include " "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/%s\"\n\n" + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" "#include \"paddle/fluid/eager/legacy/op_runner.h\"\n"; std::string forward_cc_include_str = - paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE, node_h_filename); + paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); forward_cc_stream << forward_cc_include_str; forward_cc_stream << fwd_function_str; forward_cc_stream.close(); } -static void GenerateNodeHFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateNodeHFile(const std::string& output_dir, const std::string& grad_node_str) { std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = op_type + "_node.h"; + std::string node_h_filename = "nodes.h"; std::string node_h_path = nodes_dir + 
node_h_filename; std::string node_h_include_str = "#pragma once\n" @@ -1403,12 +1630,10 @@ static void GenerateNodeHFile(const std::string& op_type, node_h_stream.close(); } -static void GenerateNodeCCFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateNodeCCFile(const std::string& output_dir, const std::string& grad_function_str) { std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = op_type + "_node.h"; - std::string node_cc_filename = op_type + "_node.cc"; + std::string node_cc_filename = "nodes.cc"; std::string node_cc_path = nodes_dir + node_cc_filename; const char* NODE_CC_INCLUDE_TEMPLATE = "#include \"glog/logging.h\"\n" @@ -1418,9 +1643,9 @@ static void GenerateNodeCCFile(const std::string& op_type, "#include \"paddle/fluid/eager/utils.h\"\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/%s\"\n\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"; std::string node_cc_include_str = - paddle::string::Sprintf(NODE_CC_INCLUDE_TEMPLATE, node_h_filename); + paddle::string::Sprintf(NODE_CC_INCLUDE_TEMPLATE); std::ofstream node_cc_stream(node_cc_path, std::ios::out); node_cc_stream << node_cc_include_str; node_cc_stream << grad_function_str; @@ -1441,6 +1666,9 @@ static std::string GenerateDygraphHFileIncludes() { static void DygraphCodeGeneration(const std::string& output_dir) { std::string dygraph_forward_api_str = GenerateDygraphHFileIncludes(); + std::string fwd_function_str = ""; + std::string grad_node_h_str = ""; + std::string grad_node_cc_str = ""; auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); @@ -1454,10 +1682,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* ----------------------------- */ /* ---- Collect Information ---- */ /* ----------------------------- */ - std::vector grad_node_default_attr_maps; std::vector grad_op_types; - std::unordered_map fwd_inputs_name_pos_map; - std::unordered_map fwd_outputs_name_pos_map; + std::vector in_vars; + std::vector out_vars; std::map grad_outs_slotname_map; std::map grad_ins_fwd_slotname_map; std::map grad_ins_grad_slotname_map; @@ -1469,65 +1696,152 @@ static void DygraphCodeGeneration(const std::string& output_dir) { grad_outs; VLOG(6) << "-------- CollectInformationFromOpInfo -------"; - bool is_available = CollectInformationFromOpInfo( - op_info, &grad_node_default_attr_maps, &grad_op_types, - &fwd_inputs_name_pos_map, &fwd_outputs_name_pos_map, + + CollectForwardInformationFromOpInfo(op_info, &in_vars, &out_vars); + + bool generate_forward_only = false; + bool is_available = CollectGradInformationFromOpInfo( + op_info, &generate_forward_only, &grad_op_types, &grad_outs_slotname_map, &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, &grad_ins, &grad_outs); - if (!is_available) continue; + if (!is_available && !generate_forward_only) { + VLOG(6) << "Skipped operator: " << op_type; + continue; + } + + VLOG(6) << "-------- PurifyOpProto -------"; + std::unordered_map fwd_inputs_name_pos_map; + std::unordered_map fwd_outputs_name_pos_map; + PurifyForwardOpProto(*op_proto, &fwd_inputs_name_pos_map, + &fwd_outputs_name_pos_map, &in_vars, &out_vars); + + if (!generate_forward_only) { + PurifyGradOpProto(*op_proto, &grad_outs_slotname_map, + &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, + &grad_ins, &grad_outs); + } /* --------------------------- */ /* --------- CodeGen --------- */ /* 
--------------------------- */ - /* ---- xxx_dygraph.cc ---- */ + /* ---- forward_dygraph_functions.cc ---- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = GenerateForwardFunctionContents( - grad_node_default_attr_maps, fwd_inputs_name_pos_map, + generate_forward_only, fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, - grad_outs, *op_proto); - std::string fwd_function_str = body_and_declaration.first; - GenerateForwardDygraphFile(op_type, output_dir, fwd_function_str); + grad_outs, op_type, in_vars, out_vars); + + fwd_function_str += body_and_declaration.first + "\n"; /* ---- dygraph_forward_api.h ---- */ std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; - /* ---- xxx_node.h ---- */ + if (generate_forward_only) continue; + + /* ---- nodes.h ---- */ VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; - std::string grad_node_h_str = GenerateGradNodeHeaderContents( - grad_node_default_attr_maps, grad_ins_fwd_slotname_map, *op_proto); - GenerateNodeHFile(op_type, output_dir, grad_node_h_str); + grad_node_h_str += + GenerateGradNodeHeaderContents(grad_ins_fwd_slotname_map, op_type, + in_vars, out_vars) + + "\n"; - /* ---- xxx_node.cc ---- */ + /* ---- nodes.cc ---- */ VLOG(6) << "-------- GenerateGradNodeCCContents -------"; - std::string grad_node_cc_str = GenerateGradNodeCCContents( - grad_node_default_attr_maps, grad_op_types, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - *op_proto); - GenerateNodeCCFile(op_type, output_dir, grad_node_cc_str); - - VLOG(6) << op_type << ": Finished Generation"; + grad_node_cc_str += GenerateGradNodeCCContents( + grad_op_types, fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, grad_outs_slotname_map, + grad_ins, grad_outs, op_type, in_vars, out_vars) + + "\n"; + + VLOG(6) << op_type << ": Finished Generating Op: " << op_type; } + /* ---- dygraph_forward_function.cc ---- */ + VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; + GenerateForwardDygraphFile(output_dir, fwd_function_str); /* ---- dygraph_forward_api.h ---- */ VLOG(6) << "-------- GenerateForwardHFile -------"; GenerateForwardHFile(output_dir, dygraph_forward_api_str); + + /* ---- nodes.h ---- */ + VLOG(6) << "-------- GenerateNodeHFile -------"; + GenerateNodeHFile(output_dir, grad_node_h_str); + + /* ---- nodes.cc ---- */ + VLOG(6) << "-------- GenerateNodeCCFile -------"; + GenerateNodeCCFile(output_dir, grad_node_cc_str); +} + +static void PrepareAttrMapForOps() { + // Handle "fused_elemwise_add_activation" + std::vector functor_list = {"a", "b"}; + operators_with_attrs["fused_elemwise_add_activation"] = {}; + operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = + functor_list; + + // Handle "fused_elemwise_activation" + operators_with_attrs["fused_elemwise_activation"] = {}; + operators_with_attrs["fused_elemwise_activation"]["functor_list"] = + functor_list; + + // Handle "reverse" + std::vector axis = {0}; + operators_with_attrs["reverse"] = {}; + operators_with_attrs["reverse"]["axis"] = axis; + + // Handle "flip" + operators_with_attrs["flip"] = {}; + operators_with_attrs["flip"]["axis"] = axis; + + // Handle "cast" + operators_with_attrs["cast"] = {}; + 
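+  // The value 5 is assumed to be framework::proto::VarType::FP32; it only needs
+  // to satisfy the attr checker while grad info is collected during codegen.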
operators_with_attrs["cast"]["out_dtype"] = 5; + operators_with_attrs["cast"]["in_dtype"] = 5; + + // Handle "transfer_dtype" + operators_with_attrs["transfer_dtype"] = {}; + operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; + operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; + + // Handle "c_split" + operators_with_attrs["c_split"] = {}; + operators_with_attrs["c_split"]["nranks"] = 1; +} + +static void CollectOperatorsToCodeGen(const std::string& op_list_path) { + std::string line; + std::ifstream op_list_file(op_list_path); + if (op_list_file.is_open()) { + while (getline(op_list_file, line)) { + operators_to_codegen.insert(line); + } + op_list_file.close(); + } else { + PADDLE_THROW( + paddle::platform::errors::Fatal("Unable to open op_list.txt file")); + } } } // namespace framework } // namespace paddle int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } std::string eager_root = argv[1]; + std::string op_list_path = argv[2]; + + paddle::framework::CollectOperatorsToCodeGen(op_list_path); + paddle::framework::PrepareAttrMapForOps(); + paddle::framework::DygraphCodeGeneration(eager_root); return 0; diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index af6cf2cec0246..56ec287561c56 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -18,12 +18,6 @@ if __name__ == "__main__": assert len(sys.argv) == 2 eager_dir = sys.argv[1] - - op_list = [] - with open(f"{eager_dir}/auto_code_generator/op_list.txt", "r") as f: - for line in f: - line = str(line.strip()) - op_list.append(line) """ paddle/fluid/eager |- generated @@ -31,15 +25,15 @@ | | "add_subdirectory(forwards), add_subdirectory(nodes)" | | |- forwards - | |- op_name + "_dygraph.cc" + | |- "dygraph_forward_functions.cc" | |- CMakeLists.txt - | | "cc_library(dygraph_function SRCS op_name+"_dygraph.cc" DEPS ${eager_deps} ${fluid_deps} GLOB_OP_LIB)" + | | "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} GLOB_OP_LIB)" | | |- nodes - | |- op_name + "_node.cc" - | |- op_name + "_node.h" + | |- "nodes.cc" + | |- "nodes.h" | |- CMakeLists.txt - | | "cc_library(dygraph_node SRCS op_name+"_node.cc" DEPS ${eager_deps} ${fluid_deps})" + | | "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})" | | |- dygraph_forward_api.h """ @@ -56,10 +50,10 @@ dygraph_forward_api_h_path = os.path.join(generated_dir, "dygraph_forward_api.h") empty_files = [dygraph_forward_api_h_path] - for op_name in op_list: - empty_files.append(os.path.join(forwards_dir, op_name + "_dygraph.cc")) - empty_files.append(os.path.join(nodes_dir, op_name + "_node.cc")) - empty_files.append(os.path.join(nodes_dir, op_name + "_node.h")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.h")) for path in empty_files: if not os.path.exists(path): @@ -73,14 +67,14 @@ with open(nodes_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_node SRCS %s DEPS ${eager_deps} ${fluid_deps})\n" - % " ".join([op_name + '_node.cc' for op_name in op_list])) + "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})\n" + ) 
f.write("add_dependencies(dygraph_node eager_codegen)") with open(forwards_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_function SRCS %s DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB})\n" - % " ".join([op_name + '_dygraph.cc' for op_name in op_list])) + "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" + ) f.write("add_dependencies(dygraph_function eager_codegen)") with open(generated_level_cmakelist_path, "w") as f: diff --git a/paddle/fluid/eager/auto_code_generator/op_list.txt b/paddle/fluid/eager/auto_code_generator/op_list.txt index 00a9abde156fb..699a84169d700 100644 --- a/paddle/fluid/eager/auto_code_generator/op_list.txt +++ b/paddle/fluid/eager/auto_code_generator/op_list.txt @@ -1,4 +1,542 @@ -sigmoid +rsqrt +multihead_matmul +addmm +gru +round +rank_attention +fused_embedding_fc_lstm +where_index +bicubic_interp +arg_min +tile +bilinear_tensor_product +ctc_align +pow2_decay_with_linear_warmup +split +fc +clear_float_status matmul_v2 -reduce_sum +load +c_embedding +elementwise_max +adadelta +chunk_eval +check_finite_and_unscale +sparse_momentum +tan +adam +fsp +where +logical_xor +multiclass_nms3 +one_hot_v2 +sequence_softmax +affine_channel +triangular_solve +sequence_topk_avg_pooling +space_to_depth +reverse +fused_embedding_eltwise_layernorm +expand_v2 +lgamma +solve +deformable_psroi_pooling +transfer_layout +instance_norm +decode_jpeg +distributed_push_sparse +gather_nd +reduce_prod +matrix_rank +asin +lstmp +iou_similarity +huber_loss +one_hot +sequence_slice +lookup_table +softplus +depthwise_conv2d +c_allreduce_sum +fused_fc_elementwise_layernorm +sigmoid_cross_entropy_with_logits +exp +scatter +c_allreduce_min +equal_all +searchsorted +fusion_squared_mat_sub +unique +log +conv_shift +smooth_l1_loss +linear_interp_v2 +momentum +temporal_shift +nce +mv +global_scatter +proximal_gd +memcpy_h2d +add_position_encoding +cosh +hash +grad_add +sign +prelu +linspace +fill_diagonal +logsigmoid +load_combine +fetch_v2 +randperm +sequence_scatter +partial_sum +relu6 +partial_allgather +c_scatter +alltoall +conv3d +lstm_unit +not_equal +transpose2 +c_sync_comm_stream +uniform_random_batch_size_like +unfold +lrn +isclose +softmax_with_cross_entropy +isfinite_v2 +bernoulli +max_pool3d_with_index +gaussian_random +flatten2 +matmul +cvm +recv_v2 +adamax +masked_select +range +bitwise_not +trace +multinomial +modified_huber_loss +c_reduce_prod +roll +squared_l2_distance +conv3d_transpose +share_data +fake_quantize_abs_max +unique_with_counts +fill +concat +fill_zeros_like +hierarchical_sigmoid +isinf_v2 +squeeze +multiclass_nms2 +bpr_loss +fft_c2c +bicubic_interp_v2 +angle +reshape +coalesce_tensor +dgc +roi_align +reshape2 +reduce_any +unstack +scatter_nd_add +sequence_reshape +bilateral_slice +fill_any_like +empty +partial_recv +pad_constant_like +pool2d +size +imag +eigh +stack +dgc_momentum +lamb +generate_proposals_v2 +c_sync_calc_stream +bitwise_or +gru_unit +fake_channel_wise_quantize_dequantize_abs_max +sampling_id +unsqueeze2 +transfer_dtype +allreduce +average_accumulates +sequence_enumerate +fusion_seqconv_eltadd_relu +bce_loss +generate_proposal_labels +im2sequence +isinf +c_reducescatter +adagrad +linear_chain_crf +retinanet_target_assign +fusion_group +teacher_student_sigmoid_loss +random_crop +lookup_table_v2 +detection_map +l1_norm +sqrt +partial_send +fused_elemwise_activation +slogdeterminant +share_buffer +bitwise_and +diag_embed +unbind +dropout 
+moving_average_abs_max_scale +beam_search +log_loss +greater_than +kron +sigmoid_focal_loss +rmsprop +conv2d +uniform_random_inplace +maxout +linear_interp +auc +logical_or +batch_norm +c_reduce_sum elementwise_add +acos +send_and_recv +unpool +cumprod +sample_logits +pull_box_extended_sparse +crop_tensor +fill_constant +deformable_conv +generate_mask_labels +locality_aware_nms +expand_as +matrix_power +greater_equal +generate_proposals +bilinear_interp +sigmoid +inplace_abn +softshrink +mul +data_norm +get_tensor_from_selected_rows +spp +floor +gelu +retinanet_detection_output +push_dense +silu +sequence_erase +real +nearest_interp_v2 +dgc_clip_by_norm +squeeze2 +strided_slice +conj +precision_recall +save +fusion_seqexpand_concat_fc +fake_quantize_range_abs_max +depthwise_conv2d_transpose +positive_negative_pair +square +var_conv_2d +log1p +fused_softmax_mask_upper_triangle +clip_by_norm +atan2 +box_decoder_and_assign +fft_r2c +roi_pool +overlap_add +fill_constant_batch_size_like +fill_any +dequantize_log +c_split +barrier +max_pool2d_with_index +pad3d +norm +viterbi_decode +mish +box_coder +flatten +elementwise_mod +margin_cross_entropy +pull_sparse +logical_and +pow +stanh +label_smooth +merged_momentum +c_reduce_min +ascend_trigger +fused_feedforward +rpn_target_assign +roi_perspective_transform +expand +prroi_pool +pool3d +memcpy +distribute_fpn_proposals +frame +bincount +shape +group_norm +c_softmax_with_cross_entropy +resnet_unit +sequence_expand_as +cos_sim +eigvals +save_combine +class_center_sample +read_file +isfinite +arg_max +equal +fake_dequantize_max_abs +qr +anchor_generator +layer_norm +merge_selected_rows +less_equal +rnn +fusion_lstm +lars_momentum +hard_sigmoid +isnan +elementwise_floordiv +correlation +histogram +gather_tree +segment_pool +sync_batch_norm +fusion_repeated_fc_relu +nop +fused_attention +expand_as_v2 +filter_by_instag +diag_v2 +pull_box_sparse +nll_loss +dot +scale +ncclBcast +shuffle_batch +ncclReduce +diag +multiplex +leaky_relu +allclose +adamw +elementwise_pow +prior_box +p_norm +c_concat +unique_consecutive +lod_reset +pad +sequence_conv +log10 +set_value +bitwise_xor +center_loss +randint +attention_lstm +uniform_random +slice +meshgrid +hard_swish +sin +mean_iou +pad2d +inverse +spectral_norm +shuffle_channel +send_v2 +psroi_pool +seed +ceil +eig +reduce_min +cos +ncclAllReduce +cudnn_lstm +reduce_sum +digamma +assign_value +increment +tdm_sampler +fused_softmax_mask +sequence_reverse +eigvalsh +diagonal +trunc +log2 +marker +tanh +yolov3_loss +graph_send_recv +accuracy +atan +less_than +unsqueeze +crf_decoding +global_gather +c_allreduce_prod +log_softmax +ftrl +matrix_nms +top_k_v2 +cast +tanh_shrink +hard_shrink +multiclass_nms +c_broadcast +fusion_transpose_flatten_concat +sequence_unpad +fused_elemwise_add_activation +pull_sparse_v2 +frobenius_norm +crop +cross_entropy2 +skip_layernorm +tdm_child +fused_embedding_seq_pool +erf +conv2d_inception_fusion +trilinear_interp +logsumexp +fusion_seqpool_concat +alloc_float_status +sequence_concat +fusion_seqpool_cvm_concat +similarity_focus +c_allreduce_max +argsort +sequence_expand +sgd +fused_bn_add_activation +bilinear_interp_v2 +clip +deformable_conv_v1 +hinge_loss +determinant +conv2d_transpose +memcpy_d2h +softsign +fake_quantize_dequantize_abs_max +broadcast_tensors +grid_sampler +fft_c2r +pyramid_hash +fake_quantize_dequantize_moving_average_abs_max +multi_dot +sequence_pool +broadcast +transpose +top_k +dist +affine_grid +gaussian_random_batch_size_like 
+fake_channel_wise_dequantize_max_abs +reciprocal +sequence_mask +fill_diagonal_tensor +abs +partial_concat +elu +index_select +row_conv +cross +elementwise_mul +decayed_adagrad +bipartite_match +run_program +fake_quantize_moving_average_abs_max +mine_hard_examples +target_assign +lstm +truncated_gaussian_random +match_matrix_tensor +elementwise_div +kldiv_loss +cumsum +sum +proximal_adagrad +update_loss_scaling +shard_index +selu +mean +gumbel_softmax +sequence_pad +tree_conv +assign +flatten_contiguous_range +tril_triu +brelu +celu +reduce_mean +sinh +rank_loss +reduce_max +fusion_gru +fill_zeros_like2 +expm1 +squared_l2_norm +elementwise_sub +margin_rank_loss +faster_tokenizer +c_identity +c_reduce_max +relu +is_empty +reduce_all +edit_distance +distributed_lookup_table +bmm +yolo_box +soft_relu +density_prior_box +eye +swish +cross_entropy +dpsgd +cholesky +batch_fc +nearest_interp +gather +trilinear_interp_v2 +box_clip +c_allgather +isnan_v2 +softmax +conv2d_fusion +fused_batch_norm_act +get_float_status +index_sample +elementwise_min +logical_not +collect_fpn_proposals +pixel_shuffle +thresholded_relu +polygon_box_transform +lookup_table_dequant +warpctc +fake_channel_wise_quantize_abs_max +dequantize_abs_max +svd +flip diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index 289f24dfa6367..c1506d8139b43 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,2 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) + +if(NOT ON_INFER) + add_subdirectory(performance_tests) +endif() diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt new file mode 100644 index 0000000000000..8811aa8ad38a5 --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op) + +cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) + +cc_test(test_egr_performance_benchmark_eager_cuda SRCS benchmark_eager_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test(test_egr_performance_benchmark_fluid_cuda SRCS benchmark_fluid_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 0a84f3b523aee..c100e3b70f384 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -14,6 +14,7 @@ // Eager Dygraph +#include #include #include "gtest/gtest.h" @@ -25,15 +26,15 @@ #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif -// TODO(jiabin): remove nolint here!!! 
-using namespace egr; // NOLINT +using namespace egr; // NOLINT +using namespace egr_utils_api; // NOLINT // Disable pten path DECLARE_bool(run_pten_kernel); @@ -42,11 +43,11 @@ TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } TEST(Benchmark, EagerScaleCPU) { // Prepare Device Contexts - egr::InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + egr::EagerTensor tensor = CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0, true); RetainGradForTensor(tensor); @@ -78,20 +79,20 @@ TEST(Benchmark, EagerScaleCPU) { TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0, true); RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + egr::EagerTensor Y = CreateTensorWithValue( ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); RetainGradForTensor(Y); @@ -122,7 +123,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) { TEST(Benchmark, EagerIntermediateMLPCPU) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); @@ -130,7 +131,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({MLP_M, MLP_N}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_X_VAL, true); RetainGradForTensor(X); @@ -140,13 +141,13 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { paddle::framework::DDim ddimW = paddle::framework::make_ddim({MLP_N, MLP_K}); - egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + egr::EagerTensor W = CreateTensorWithValue( ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_W_VAL, true); RetainGradForTensor(W); paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); - egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + egr::EagerTensor B = CreateTensorWithValue( ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_B_VAL, true); RetainGradForTensor(B); @@ -178,3 +179,8 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { } } } + +USE_OP(scale); +USE_OP(elementwise_add); +USE_OP(matmul_v2); +USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index b373802c79eb4..c8f4b1b32e453 100644 --- 
a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -13,6 +13,7 @@ // limitations under the License. // Eager Dygraph +#include #include #include "gtest/gtest.h" @@ -24,26 +25,28 @@ #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT +using namespace egr; // NOLINT +using namespace egr_utils_api; // NOLINT DECLARE_bool(run_pten_kernel); TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + TEST(Benchmark, EagerScaleCUDA) { - egr::InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + egr::EagerTensor tensor = CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); RetainGradForTensor(tensor); @@ -77,7 +80,7 @@ TEST(Benchmark, EagerScaleCUDA) { TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); auto tracer = std::make_shared(); tracer->SetExpectedPlace(place); @@ -85,13 +88,13 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0, true); RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + egr::EagerTensor Y = CreateTensorWithValue( ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); RetainGradForTensor(Y); @@ -125,7 +128,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { TEST(Benchmark, EagerIntermediateMLPCUDA) { paddle::platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); auto tracer = std::make_shared(); tracer->SetExpectedPlace(place); @@ -134,7 +137,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({MLP_M, MLP_N}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_X_VAL, true); RetainGradForTensor(X); @@ -144,13 +147,13 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { paddle::framework::DDim ddimW = paddle::framework::make_ddim({MLP_N, MLP_K}); - egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + egr::EagerTensor W = CreateTensorWithValue( ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_W_VAL, true); RetainGradForTensor(W); 
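+      // Per-layer bias, shaped {MLP_K} and filled with MLP_B_VAL on the CUDAPlace.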
paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); - egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + egr::EagerTensor B = CreateTensorWithValue( ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_B_VAL, true); RetainGradForTensor(B); @@ -185,3 +188,11 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { } } } + +USE_OP(scale); +USE_OP(matmul_v2); +USE_OP(reduce_sum); +USE_OP(reduce_sum_grad); +USE_OP(elementwise_add); + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 20844055e300d..68e7512eedbde 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -45,7 +45,7 @@ namespace imperative { TEST(Benchmark, FluidScaleCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -88,7 +88,7 @@ TEST(Benchmark, FluidScaleCPU) { TEST(Benchmark, FluidMatmulCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -141,7 +141,7 @@ TEST(Benchmark, FluidMatmulCPU) { TEST(Benchmark, FluidMLPCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); @@ -217,5 +217,6 @@ TEST(Benchmark, FluidMLPCPU) { } // namespace paddle USE_OP(scale); +USE_OP(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 620a4d1cd128d..50423b5a64fcf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -39,13 +39,15 @@ DECLARE_bool(run_pten_kernel); TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + namespace paddle { namespace imperative { TEST(Benchmark, FluidScaleCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -98,7 +100,7 @@ TEST(Benchmark, FluidScaleCUDA) { TEST(Benchmark, FluidMatmulCUDA) { // Prepare Device 
Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -161,7 +163,7 @@ TEST(Benchmark, FluidMatmulCUDA) { TEST(Benchmark, FluidMLPCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::platform::DeviceContextPool& pool = @@ -252,3 +254,6 @@ USE_OP(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); +USE_OP(elementwise_add); + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index ae5d02c1e943f..baa99dc93c2dd 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -36,10 +36,6 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" - -#include "paddle/pten/core/kernel_registry.h" - static size_t max_num_benchmark_runs = 5000; namespace egr { @@ -64,9 +60,9 @@ void benchmark_eager_scale(const EagerTensor& tensor, bool accuracy_check) { if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) - CompareTensorWithValue(input_tensor, 8189.0); + eager_test::CompareTensorWithValue(input_tensor, 8189.0); // Examine Backward Grad (w.r.t max_num_runs = 10) - CompareGradTensorWithValue(tensor, 1024.0); + eager_test::CompareGradTensorWithValue(tensor, 1024.0); } } @@ -89,10 +85,10 @@ void benchmark_eager_intermediate_matmul(const EagerTensor& X, if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) - CompareVariableWithValue(input_tensor0, 16); + eager_test::CompareVariableWithValue(input_tensor0, 16); // Examine Backward Grad (w.r.t max_num_runs = 2) - CompareGradVariableWithValue(X, 16); - CompareGradVariableWithValue(Y, 16); + eager_test::CompareGradVariableWithValue(X, 16); + eager_test::CompareGradVariableWithValue(Y, 16); } } @@ -122,11 +118,11 @@ void benchmark_eager_intermediate_mlp(const EagerTensor& X, compute_mlp_expected_results(); // Examine Forward Grad (w.r.t max_num_runs = 2) - CompareVariableWithValue(Out, result["Out"]); + eager_test::CompareVariableWithValue(Out, result["Out"]); // Examine Backward Grad (w.r.t max_num_runs = 2) - CompareGradVariableWithValue(X, result["GradX"]); - CompareGradVariableWithValue(Ws[0], result["GradW"]); + eager_test::CompareGradVariableWithValue(X, result["GradX"]); + eager_test::CompareGradVariableWithValue(Ws[0], result["GradW"]); } } @@ -141,6 +137,8 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, auto* tensor = X->MutableVar()->GetMutable(); float* t_ptr = tensor->mutable_data(place); std::vector host_data(tensor->numel()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -153,6 +151,8 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, sizeof(float) * tensor->numel(), stream); t_ptr = host_data.data(); } +#endif + VLOG(6) << "Tensor Value: " << t_ptr[0] << ", Expected Value: " << value; PADDLE_ENFORCE( t_ptr[0] == value, @@ -166,6 +166,8 @@ static void FluidCheckGradTensorValue( auto* grad_tensor = 
X->MutableGradVar()->GetMutable(); float* g_ptr = grad_tensor->mutable_data(place); std::vector g_host_data(grad_tensor->numel()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -178,6 +180,8 @@ static void FluidCheckGradTensorValue( sizeof(float) * grad_tensor->numel(), stream); g_ptr = g_host_data.data(); } +#endif + VLOG(6) << "Tensor Value: " << g_ptr[0] << ", Expected Value: " << value; PADDLE_ENFORCE( g_ptr[0] == value, diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 3921ce5b69cd7..c03db1a1575df 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,6 +6,6 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index d63cff23ba9c8..0ec86b7cc360c 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,19 +30,17 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_meta.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Backward, SingleNodeEmptyGrad) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor target_tensor = CreateTensorWithValue( + egr::EagerTensor target_tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); @@ -67,7 +65,7 @@ TEST(Backward, SingleNodeEmptyGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); @@ -80,26 +78,26 @@ TEST(Backward, SingleNodeEmptyGrad) { RunBackward(outs, {}); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 5.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); } TEST(Backward, SingleNodeCustomGrad) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); std::vector grad_tensors; // Create Grad Tensor - egr::EagerTensor 
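In the `task_tests/CMakeLists.txt` hunk above, `if(NOT DEFINED ON_INFER)` only tested whether the variable existed, so `test_egr_task_autocodegen` was skipped whenever `ON_INFER` was defined at all, even with value `OFF`; `if(NOT ON_INFER)` evaluates the value itself, which is presumably the intended gate for this test.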
grad_tensor = CreateTensorWithValue( + egr::EagerTensor grad_tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); grad_tensors.emplace_back(std::move(grad_tensor)); @@ -128,7 +126,7 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); @@ -141,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { RunBackward(target_tensors, grad_tensors); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 50.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); } /* @@ -153,14 +151,14 @@ Node0 */ TEST(Backward, LinearNodes) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -202,7 +200,7 @@ TEST(Backward, LinearNodes) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node1 -> AccumulationNode via Edge auto meta1 = egr::AutogradMeta(); @@ -215,7 +213,7 @@ TEST(Backward, LinearNodes) { RunBackward(target_tensors, {}); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 50.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); } /* @@ -227,17 +225,17 @@ Node0 Node1 */ TEST(Backward, WithAccumulation) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor std::vector target_tensors; - egr::EagerTensor tensor0 = CreateTensorWithValue( + egr::EagerTensor tensor0 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); - egr::EagerTensor tensor1 = CreateTensorWithValue( + egr::EagerTensor tensor1 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor0)); @@ -245,10 +243,10 @@ TEST(Backward, WithAccumulation) { // Create Grad Tensor std::vector grad_tensors; - egr::EagerTensor grad_tensor0 = CreateTensorWithValue( + egr::EagerTensor grad_tensor0 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); - egr::EagerTensor grad_tensor1 = CreateTensorWithValue( + egr::EagerTensor grad_tensor1 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); grad_tensors.emplace_back(std::move(grad_tensor0)); @@ -303,7 +301,7 @@ TEST(Backward, 
WithAccumulation) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node2 -> AccumulationNode via Edge auto meta2 = egr::AutogradMeta(); @@ -314,7 +312,7 @@ TEST(Backward, WithAccumulation) { RunBackward(target_tensors, grad_tensors); - CompareGradTensorWithValue(leaf_tensor, 2500.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index e1e138cdee8ba..52e10b2b1b8a0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,17 +31,15 @@ #include "paddle/fluid/eager/tests/test_utils.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -60,7 +58,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(scale_node_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 auto meta = AutogradMeta(); meta.SetSingleOutRankWithSlot(0, 0); @@ -71,18 +69,18 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 5.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 10.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 4d93f0188a746..ea9aae83ff189 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -24,10 +24,7 @@ #include "paddle/pten/api/lib/utils/allocator.h" -// TODO(jiabin): remove nolint here!!! 
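The backward and cross-batch test hunks above all apply the same mechanical change: the file-level `using namespace egr` is removed, the tests move into `namespace egr`, and the shared helpers are called through their home namespaces. A minimal sketch of the resulting call pattern inside one of these test files (the test name is illustrative; helper signatures are the ones visible in these hunks):

    namespace egr {

    TEST(Backward, NamespaceSketch) {
      eager_test::InitEnv(paddle::platform::CPUPlace());

      paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
      egr::EagerTensor t = egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
      egr_utils_api::RetainGradForTensor(t);

      eager_test::CompareTensorWithValue(t, 1.0);
    }

    }  // namespace egr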
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(EagerUtils, AutoGradMeta) { // Construct Eager Tensor @@ -63,7 +60,7 @@ TEST(EagerUtils, AutoGradMeta) { std::vector autograd_metas = EagerUtils::multi_autograd_meta(&ets); std::vector unsafe_autograd_metas = - EagerUtils::unsafe_autograd_meta(&ets); + EagerUtils::unsafe_autograd_meta(ets); CHECK_NOTNULL(unsafe_autograd_metas[0]); CHECK_NOTNULL(unsafe_autograd_metas[1]); @@ -167,7 +164,7 @@ TEST(EagerUtils, PassStopGradient) { TEST(EagerUtils, SyncToVarsSingle) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - auto tensor = eager_test::CreateTestCPUTensor(5.0f, ddim); + auto tensor = CreateTestCPUTensor(5.0f, ddim); std::vector> var_bases = egr::EagerUtils::SyncToVars(tensor); @@ -185,9 +182,8 @@ TEST(EagerUtils, SyncToVarsSingle) { TEST(EagerUtils, SyncToVarsMultiple) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - std::vector tensors = { - eager_test::CreateTestCPUTensor(1.0f, ddim), - eager_test::CreateTestCPUTensor(2.0f, ddim)}; + std::vector tensors = {CreateTestCPUTensor(1.0f, ddim), + CreateTestCPUTensor(2.0f, ddim)}; std::vector> var_bases = egr::EagerUtils::SyncToVars(tensors); @@ -280,4 +276,4 @@ TEST(EagerUtils, ConstructDuplicableOutput) { CHECK(outs[0]->initialized() == false); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 6e23226cde432..205f231eceeed 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,21 +27,18 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_meta.h" -// TODO(jiabin): remove nolint here!!! 
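The `unsafe_autograd_meta(&ets)` to `unsafe_autograd_meta(ets)` call-site change above matches the new declaration in `utils.h`/`utils.cc` further down, which takes the tensors by const reference instead of by pointer. A usage sketch, assuming `et0` and `et1` are initialized `egr::EagerTensor`s that already carry autograd meta:

    std::vector<egr::EagerTensor> ets = {et0, et1};
    std::vector<egr::AutogradMeta*> metas =
        egr::EagerUtils::unsafe_autograd_meta(ets);  // previously: unsafe_autograd_meta(&ets)
    CHECK_NOTNULL(metas[0]);
    CHECK_NOTNULL(metas[1]);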
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Forward, SingleNode) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -55,7 +52,7 @@ TEST(Forward, SingleNode) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); // Examine GradNode { @@ -80,14 +77,14 @@ Node1 out */ TEST(Forward, LinearNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -108,10 +105,10 @@ TEST(Forward, LinearNodes) { out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine GradNode { @@ -156,14 +153,14 @@ TEST(Forward, LinearNodes) { out1 out2 */ TEST(Forward, BranchedNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -190,13 +187,13 @@ TEST(Forward, BranchedNodes) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 - CompareTensorWithValue(out2, 150.0); + eager_test::CompareTensorWithValue(out2, 150.0); // Examine GradNode { @@ -248,4 +245,4 @@ TEST(Forward, BranchedNodes) { } } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 751e95487659c..e292844c8ee58 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -29,10 +29,7 @@ #include "paddle/fluid/eager/tests/test_utils.h" -// TODO(jiabin): remove nolint here!!! 
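The forward tests keep exercising the eager `scale` kernel directly; with the input `tensor` filled with 5.0 as in the hunks above, the single-node case reduces to the following check (5.0 * 2.0 + 3.0 = 13.0):

    float scale = 2.0;
    float bias = 3.0;
    egr::EagerTensor out = egr::scale(
        tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/);
    eager_test::CompareTensorWithValue(out, 13.0);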
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { egr::EagerTensor hook_function(const egr::EagerTensor& t) { auto t_dense = std::dynamic_pointer_cast(t.impl()); @@ -61,14 +58,14 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { } TEST(FwdBwdJoint, SingleNode) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward float scale = 2.0; @@ -77,7 +74,7 @@ TEST(FwdBwdJoint, SingleNode) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); std::vector outs = {out}; // 4. Run Backward @@ -88,7 +85,7 @@ TEST(FwdBwdJoint, SingleNode) { EagerUtils::unsafe_autograd_meta(tensor)->Grad().impl()) ->data()[0]; // Examine Backward Grad - CompareGradTensorWithValue(tensor, 2.0); + eager_test::CompareGradTensorWithValue(tensor, 2.0); } /* @@ -101,14 +98,14 @@ Node1 out */ TEST(FwdBwdJoint, LinearNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -125,17 +122,17 @@ TEST(FwdBwdJoint, LinearNodes) { out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); std::vector outs = {out1}; // 4. Run Backward RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 10.0); + eager_test::CompareGradTensorWithValue(tensor, 10.0); } /* @@ -149,14 +146,14 @@ TEST(FwdBwdJoint, LinearNodes) { out1 out2 */ TEST(FwdBwdJoint, BranchedNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. 
Run Forward // Run Forward Node 0 @@ -179,10 +176,10 @@ TEST(FwdBwdJoint, BranchedNodes) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 { @@ -201,7 +198,7 @@ TEST(FwdBwdJoint, BranchedNodes) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); } /* @@ -215,14 +212,14 @@ TEST(FwdBwdJoint, BranchedNodes) { out1 out2 */ TEST(FwdBwdJoint, GradientHook) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); std::function hook = &hook_function; @@ -234,24 +231,24 @@ TEST(FwdBwdJoint, GradientHook) { egr::EagerTensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out0); // hook: +5 - RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; egr::EagerTensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out1); // hook: +5 - RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; egr::EagerTensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out2); // hook: +5 - RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 // 4. Run Backward std::vector outs = {out1, out2}; @@ -259,16 +256,16 @@ TEST(FwdBwdJoint, GradientHook) { // Examine Backward Grad // leaf grad - CompareGradTensorWithValue(tensor, 190.0); + eager_test::CompareGradTensorWithValue(tensor, 190.0); // out0 grad - CompareGradTensorWithValue(out0, 90.0); + eager_test::CompareGradTensorWithValue(out0, 90.0); // out1 grad - CompareGradTensorWithValue(out1, 1.0); + eager_test::CompareGradTensorWithValue(out1, 1.0); // out2 grad - CompareGradTensorWithValue(out2, 1.0); + eager_test::CompareGradTensorWithValue(out2, 1.0); } /* @@ -282,14 +279,14 @@ TEST(FwdBwdJoint, GradientHook) { out1 out2 */ TEST(FwdBwdJoint, CrossBatchAccumulation) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. 
Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -316,13 +313,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 60.0); + eager_test::CompareGradTensorWithValue(tensor, 60.0); } /* ---------------------------------------------------- */ @@ -331,14 +328,14 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(FwdBwdJoint, SingleNodeCUDA) { - InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward float scale = 2.0; @@ -347,14 +344,14 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); std::vector outs = {out}; // 4. Run Backward RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 2.0); + eager_test::CompareGradTensorWithValue(tensor, 2.0); } /* @@ -368,14 +365,14 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { out1 out2 */ TEST(FwdBwdJoint, BranchedNodesCUDA) { - InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -398,11 +395,11 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 - CompareTensorWithValue(out2, 150.0); + eager_test::CompareTensorWithValue(out2, 150.0); // TODO(jiabin): fix this with add functor // 4. 
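The expected gradients in these branched forward/backward tests follow from the scale chain: with the input filled with 5.0, `out0 = 2*x + 3`, `out1 = 5*out0 + 10`, and `out2 = 10*out0 + 20`, so backpropagating from both branches gives d(out1 + out2)/dx = 5*2 + 10*2 = 30, and a second `RunBackward` over the same graph accumulates to 60, which is what `CompareGradTensorWithValue` checks with 30.0 and 60.0.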
Run Backward @@ -410,8 +407,8 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); } #endif -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index eb8d1e517eaf3..a06091247bf7a 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,66 +30,98 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/pten/core/kernel_registry.h" -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Generated, Sigmoid) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); VLOG(6) << "Init Env"; // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); VLOG(6) << "Make Dim"; - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 0.0, true); VLOG(6) << "Make EagerTensor"; - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Retain Grad for Tensor"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); VLOG(6) << "Run Backward"; - CompareVariableWithValue(output_tensor, 0.5); + eager_test::CompareVariableWithValue(output_tensor, 0.5); std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - CompareGradVariableWithValue(tensor, 0.25); + eager_test::CompareGradVariableWithValue(tensor, 0.25); } TEST(Generated, Matmul_v2) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); // 1. Prepare Input paddle::framework::DDim ddimX = paddle::framework::make_ddim({4, 16}); - egr::EagerTensor X = CreateTensorWithValue( + egr::EagerTensor X = egr_utils_api::CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 3.0, true); - RetainGradForTensor(X); + egr_utils_api::RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({16, 20}); - egr::EagerTensor Y = CreateTensorWithValue( + egr::EagerTensor Y = egr_utils_api::CreateTensorWithValue( ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); - RetainGradForTensor(Y); + egr_utils_api::RetainGradForTensor(Y); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); - CompareVariableWithValue(output_tensor, 96); + eager_test::CompareVariableWithValue(output_tensor, 96); + + std::vector target_tensors = {output_tensor}; + RunBackward(target_tensors, {}); + + eager_test::CompareGradVariableWithValue(X, 2.0 * 20); + eager_test::CompareGradVariableWithValue(Y, 3.0 * 4); +} + +TEST(Generated, ElementwiseAdd) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + auto tracer = std::make_shared(); + paddle::imperative::SetCurrentTracer(tracer); + + // 1. 
Prepare Input + paddle::framework::DDim ddimX = paddle::framework::make_ddim({4, 16}); + egr::EagerTensor X = egr_utils_api::CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 3.0, true); + egr_utils_api::RetainGradForTensor(X); + + paddle::framework::DDim ddimY = paddle::framework::make_ddim({4, 16}); + egr::EagerTensor Y = egr_utils_api::CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 2.0, true); + egr_utils_api::RetainGradForTensor(Y); + + auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); + + eager_test::CompareVariableWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; RunBackward(target_tensors, {}); - CompareGradVariableWithValue(X, 2.0 * 20); - CompareGradVariableWithValue(Y, 3.0 * 4); + eager_test::CompareGradVariableWithValue(X, 1.0); + eager_test::CompareGradVariableWithValue(Y, 1.0); } -} // namespace eager_test +} // namespace egr + +USE_OP(sigmoid); +USE_OP(elementwise_add); +USE_OP(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 326240d0cb7b9..32b28d8efd21b 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -30,9 +30,7 @@ #include "paddle/fluid/eager/tests/test_utils.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { egr::EagerTensor hook_function(const egr::EagerTensor& t) { auto t_dense = std::dynamic_pointer_cast(t.impl()); @@ -61,14 +59,14 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { } TEST(RetainGrad, HookBeforeRetainGrad) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -99,8 +97,9 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RegisterGradientHookForTensor(target_tensor, hook); - RetainGradForTensor(target_tensor); // result: 1.0 + 3.0 = 4.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 } // Connect ScaleNode -> AccumulationNode via Edge @@ -126,25 +125,26 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RegisterGradientHookForTensor(leaf_tensor, hook); - RetainGradForTensor(leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 4.0); - CompareGradTensorWithValue(leaf_tensor, 23.0); + eager_test::CompareGradTensorWithValue(target_tensor, 4.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } TEST(RetainGrad, HookAfterRetainGrad) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = 
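The new `Generated, ElementwiseAdd` test above follows the same shape as `Matmul_v2`, where the expected gradients 2.0 * 20 and 3.0 * 4 come from the constant operands: dX picks up Y's value 2.0 summed over the 20 output columns, and dY picks up X's value 3.0 summed over the 4 rows. The essential sequence of the new test, with X filled with 3.0 and Y with 2.0:

    auto out = elementwise_add_dygraph_function(X, Y, {});
    eager_test::CompareVariableWithValue(out, 5);      // 3.0 + 2.0, elementwise

    std::vector<egr::EagerTensor> targets = {out};
    RunBackward(targets, {});

    eager_test::CompareGradVariableWithValue(X, 1.0);  // d(X + Y)/dX = 1
    eager_test::CompareGradVariableWithValue(Y, 1.0);  // d(X + Y)/dY = 1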
paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -173,8 +173,8 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RetainGradForTensor(target_tensor); // result: 1.0 - RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); } // Connect ScaleNode -> AccumulationNode via Edge @@ -200,15 +200,15 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RetainGradForTensor(leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 - RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // RetainGrad for leaf tensor gets + // postponed, result: 4.0*5.0 + 3.0 = + // 23.0 + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 23.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } - -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index 5b96c726b2228..5e86cac83a285 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,39 +23,34 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/kernel_registry.h" - -// TODO(jiabin): remove nolint here!!! 
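The hook tests above only change how the API is reached; registration itself is unchanged. With this file's `hook_function` adding 3.0 to the incoming gradient (per the inline comments), the order-sensitive pattern is:

    std::function<egr::EagerTensor(const egr::EagerTensor&)> hook = &hook_function;

    egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook);
    egr_utils_api::RetainGradForTensor(target_tensor);  // retained grad: 1.0 + 3.0 = 4.0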
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(TensorUtils, Test) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - egr::EagerTensor t_grad = CreateTensorWithValue( + egr::EagerTensor t_grad = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); - CHECK_EQ(IsLeafTensor(t), true); + CHECK_EQ(egr_utils_api::IsLeafTensor(t), true); // Test Utils - CompareTensorWithValue(t, 5.0); + eager_test::CompareTensorWithValue(t, 5.0); egr::AutogradMeta* meta = egr::EagerUtils::autograd_meta(&t); *meta->MutableGrad() = t_grad; - CompareGradTensorWithValue(t, 1.0); + eager_test::CompareGradTensorWithValue(t, 1.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 28eefd62c5aa0..be06bf9eb344b 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -48,9 +48,9 @@ AutogradMeta* EagerUtils::unsafe_autograd_meta(const egr::EagerTensor& target) { } std::vector EagerUtils::unsafe_autograd_meta( - std::vector* targets) { + const std::vector& targets) { std::vector metas; - for (const egr::EagerTensor& t : *targets) { + for (const egr::EagerTensor& t : targets) { metas.push_back(unsafe_autograd_meta(t)); } return metas; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index f7e226a2aba36..03f922e5bf9ba 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -114,7 +114,7 @@ class EagerUtils { // This method will return an AutogradMeta pointer unsafely. static AutogradMeta* unsafe_autograd_meta(const egr::EagerTensor& target); static std::vector unsafe_autograd_meta( - std::vector* targets); + const std::vector& targets); template static bool ComputeRequireGrad(T trace_backward, Args&&... args) { diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index db8dc22f68663..51446f287e94b 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -17,11 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator_kernel_configs.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index c511526c3159d..1b5db8380514d 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -463,6 +463,11 @@ void DatasetImpl::WaitPreLoadDone() { // release memory data template void DatasetImpl::ReleaseMemory() { + release_thread_ = new std::thread(&DatasetImpl::ReleaseMemoryFun, this); +} + +template +void DatasetImpl::ReleaseMemoryFun() { VLOG(3) << "DatasetImpl::ReleaseMemory() begin"; if (input_channel_) { input_channel_->Clear(); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index b41f701548f3f..58223a2f28b4f 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -63,6 +63,7 @@ class Dataset { virtual void SetTrainerNum(int trainer_num) = 0; // set fleet send batch size virtual void SetFleetSendBatchSize(int64_t size) = 0; + virtual void ReleaseMemoryFun() = 0; // set fs name and ugi virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) = 0; @@ -168,8 +169,13 @@ template class DatasetImpl : public Dataset { public: DatasetImpl(); - virtual ~DatasetImpl() {} + virtual ~DatasetImpl() { + if (release_thread_ != nullptr) { + release_thread_->join(); + } + } virtual void SetFileList(const std::vector& filelist); + virtual void ReleaseMemoryFun(); virtual void SetThreadNum(int thread_num); virtual void SetTrainerNum(int trainer_num); virtual void SetFleetSendBatchSize(int64_t size); @@ -295,6 +301,7 @@ class DatasetImpl : public Dataset { int64_t fleet_send_batch_size_; int64_t fleet_send_sleep_seconds_; std::vector preload_threads_; + std::thread* release_thread_ = nullptr; bool merge_by_insid_; bool parse_ins_id_; bool parse_content_; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 3429677a2403e..b1573093ec333 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -291,13 +291,9 @@ void AllReduceOpHandle::SyncNCCLAllReduce() { nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); auto &nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); -#endif + + platform::GpuStreamSync(stream); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } } } diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 033d9396e9bf2..02e35895205b7 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -33,7 +33,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include 
"paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/framework/details/bkcl_op_handle.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 36b840e4945a0..a11a244214d4f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -111,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, numel, static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 0b062b1a3f49a..055c7e63863b3 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -44,7 +44,7 @@ struct BKCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 6ca4baa6d8b04..2e82fe22dba73 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -95,7 +95,7 @@ struct TestBroadcastOpHandle { #endif } else if (use_device_ == p::kCUDA) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 68c5daaac5d78..f9c28cbee50c3 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -40,7 +40,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 07f7bbdb97a8d..bcdd6129230b0 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -49,10 +49,10 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( platform::CUDADeviceGuard guard( BOOST_GET_CONST(platform::CUDAPlace, place).device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); #endif PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument( @@ -75,9 +75,9 @@ 
EagerDeletionOpHandle::~EagerDeletionOpHandle() { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } #endif @@ -160,12 +160,12 @@ void EagerDeletionOpHandle::ClearGarbages( reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(callback_stream, event_, 0)); #endif }; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 94507140a81d6..bd153f24fa318 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -55,9 +55,9 @@ FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif }; destroy_event(start_event_); @@ -87,10 +87,10 @@ void FusedAllReduceOpHandle::RunImpl() { auto create_event = [](gpuEvent_t *event) { if (*event) return; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); #endif }; @@ -109,12 +109,12 @@ void FusedAllReduceOpHandle::RunImpl() { auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); nccl_stream = nccl_ctx.stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(nccl_stream, start_event_, 0)); #endif } else { @@ -169,12 +169,12 @@ void FusedAllReduceOpHandle::RunImpl() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + 
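The pattern repeated through these op-handle hunks is a pure rename of the status-check macro, applied to both the CUDA and HIP branches; for example, the event-creation helper in `fused_all_reduce_op_handle.cc` now reads:

    auto create_event = [](gpuEvent_t *event) {
      if (*event) return;
    #ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(
          hipEventCreateWithFlags(event, hipEventDisableTiming));
    #else
      PADDLE_ENFORCE_GPU_SUCCESS(
          cudaEventCreateWithFlags(event, cudaEventDisableTiming));
    #endif
    };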
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(compute_stream, end_event_, 0)); #endif } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index 31336b92c4dfb..d522981c77fa1 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -35,7 +35,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 2fd1e0e7e9889..e08a768f8ce07 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -37,7 +37,7 @@ struct NCCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 98c37ca3c406a..38e20127f1612 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -48,7 +48,7 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h index c59f61347303d..9cfc3ada6ac3d 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h @@ -35,7 +35,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index a9ea336e42545..8255707654416 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -40,7 +40,7 @@ static std::vector& multi_op_var2gpu_str_mutex() { } static void InitMultiGPUOpVarMap() { - int dev_count = platform::GetCUDADeviceCount(); + int dev_count = platform::GetGPUDeviceCount(); PADDLE_ENFORCE_GT(dev_count, 0, platform::errors::NotFound( "cuda device must > 0, now dev_count=%d", dev_count)); @@ -161,11 +161,11 @@ void TensorCheckerVisitor::apply( op_var)); #ifdef __HIPCC__ - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, hipMemcpyHostToDevice, 
dev_ctx->stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, cudaMemcpyHostToDevice, dev_ctx->stream())); #endif diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 762f4071b5cab..324d39ed8bb77 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -27,7 +27,7 @@ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/rccl.h" #endif -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" DECLARE_bool(sync_nccl_allreduce); @@ -52,16 +52,16 @@ class NCCLOpHandleBase : public OpHandleBase { virtual ~NCCLOpHandleBase() { for (auto& ev : inter_events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } for (auto& ev : exter_events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } } @@ -109,14 +109,14 @@ class NCCLOpHandleBase : public OpHandleBase { platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &inter_events_[dev_id], hipEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &exter_events_[dev_id], hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &exter_events_[dev_id], cudaEventDisableTiming)); #endif VLOG(10) << "Create events on dev_id:" << dev_id @@ -142,7 +142,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } @@ -192,7 +192,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); #ifdef PADDLE_WITH_HIP @@ -202,11 +202,7 @@ class NCCLOpHandleBase : public OpHandleBase { #endif if (FLAGS_sync_nccl_allreduce) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } @@ -230,26 +226,21 @@ class NCCLOpHandleBase : public OpHandleBase { #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, 
comm, stream)); hipEventRecord(exter_events_.at(dev_id), stream); - - if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - } #else cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); cudaEventRecord(exter_events_.at(dev_id), stream); - +#endif if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + platform::GpuStreamSync(stream); } -#endif } void InterBroadCast(platform::Place place, void* sendbuff, size_t count, @@ -269,7 +260,7 @@ class NCCLOpHandleBase : public OpHandleBase { #else cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( sendbuff, count, datatype, 0, comm, stream)); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4b5d0563d7394..25b5eefc05cda 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -35,9 +35,9 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } } @@ -50,10 +50,10 @@ void OpHandleBase::InitCUDA() { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); #endif } @@ -182,9 +182,9 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { static_cast(waited_ctx)->stream(); for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif } } @@ -221,10 +221,10 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { static_cast(dev_ctxes_.at(place)) ->stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #endif #else @@ -250,11 +250,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { auto stream = static_cast(pool.Get(place)) ->stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with CUDA.")); @@ -279,10 +275,10 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { 
dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #endif #else @@ -319,10 +315,10 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #endif } diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index a485838a95942..bbc458804a195 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -193,7 +193,7 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, static_cast(type), ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index d56b6b3663003..4b9f289eaa787 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -41,7 +41,7 @@ struct NCCLContextMap; } // namespace platform } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 82f5ea6a66891..35dba48845472 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -59,7 +59,7 @@ struct TestReduceOpHandle { use_gpu_ = use_gpu; if (use_gpu) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index ccc64a9cdc335..1225e2ee025b2 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -39,14 +39,14 @@ ShareTensorBufferFunctor::ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, const std::vector &out_var_names, const bool &is_variant_scope, - bool share_dims) + bool share_dims_and_dtype) : scope_(scope), scope_idx_(scope_idx), op_type_(op_type), in_var_infos_(in_var_infos), out_var_names_(out_var_names), is_variant_scope_(is_variant_scope), - share_dims_(share_dims) { + 
share_dims_and_dtype_(share_dims_and_dtype) { PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(), platform::errors::PreconditionNotMet( "The number of input variables and output variables " @@ -147,12 +147,14 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { // NOTE(zhiqiu): In the case of inplace addto, if the operator of // the in_out_vars is skipped during running, we should set the dims of // output as the same as input. - if (share_dims_) { + if (share_dims_and_dtype_) { out_tensor->Resize(in_tensor.dims()); + out_tensor->ShareDataTypeWith(in_tensor); } VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " - << in_var_info->Name() << " -> " << out_var_names_[i]; + << in_var_info->Name() << " -> " << out_var_names_[i] + << " share_dims_and_dtype = " << share_dims_and_dtype_; } } } diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index 528b047bccc13..f0ddb3f0137a2 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -73,12 +73,14 @@ class ShareTensorBufferFunctor { Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, const std::vector &out_var_names, - const bool &is_variant_scope, bool share_dims = false); + const bool &is_variant_scope, bool share_dims_and_dtype = false); void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); - void SetShareDims(bool share_dims) { share_dims_ = share_dims; } + void SetShareDimsAndDtype(bool share_dims_and_dtype) { + share_dims_and_dtype_ = share_dims_and_dtype; + } void operator()(Scope *exec_scope); @@ -108,7 +110,7 @@ class ShareTensorBufferFunctor { // NOTE(zhiqiu): In the case of inplace addto, if the operator of // the in_out_vars is skipped during running, we should set the dims of output // as the same as input. 
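The rename above widens what the functor shares: with `share_dims_and_dtype_` enabled, an output whose producing op was skipped under inplace addto now inherits the input's dtype as well as its dims (via `Resize` plus `ShareDataTypeWith`). A minimal self-contained sketch of that behaviour follows; it is an illustration only, not part of the patch, and `DenseTensorLike` is a hypothetical stand-in for `framework::Tensor`:

#include <cassert>

// Hypothetical stand-in for the tensor metadata touched by the functor.
struct DenseTensorLike {
  int dims = 0;   // simplified "shape"
  int dtype = 0;  // simplified type id
};

// Mirrors the share_dims_and_dtype_ branch: when enabled, the output takes
// over both the shape and the dtype of the reused input.
void ShareMeta(const DenseTensorLike& in, DenseTensorLike* out,
               bool share_dims_and_dtype) {
  if (share_dims_and_dtype) {
    out->dims = in.dims;    // corresponds to out_tensor->Resize(in_tensor.dims())
    out->dtype = in.dtype;  // corresponds to out_tensor->ShareDataTypeWith(in_tensor)
  }
}

int main() {
  DenseTensorLike in{8, 5}, out{};
  ShareMeta(in, &out, /*share_dims_and_dtype=*/true);
  assert(out.dims == 8 && out.dtype == 5);
  return 0;
}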
- bool share_dims_{false}; + bool share_dims_and_dtype_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 7e10c669ac478..aa942415fb404 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -64,10 +64,10 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle( ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names, bool share_dims) + const std::vector &out_var_names, bool share_dims_and_dtype) : OpHandleBase(node), functor_(scope, scope_idx, op_type, in_var_infos, out_var_names, - is_variant_scope_, share_dims) {} + is_variant_scope_, share_dims_and_dtype) {} std::unordered_map ShareTensorBufferOpHandle::ReusedVars() const { @@ -79,8 +79,9 @@ void ShareTensorBufferOpHandle::AddReuseVarPair( functor_.AddReuseVarPair(in_var_info, out_var_name); } -void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) { - functor_.SetShareDims(share_dims); +void ShareTensorBufferOpHandle::SetShareDimsAndDtype( + bool share_dims_and_dtype) { + functor_.SetShareDimsAndDtype(share_dims_and_dtype); } void ShareTensorBufferOpHandle::InitCUDA() { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index dd2364fec4af5..d3852a85d019b 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -56,7 +56,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase { void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); - void SetShareDims(bool share_dims); + void SetShareDimsAndDtype(bool share_dims_and_dtype); const ShareTensorBufferFunctor &Functor() const { return functor_; } diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 37399e5ddc09d..d916b9bc26276 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(sync_nccl_allreduce); @@ -182,7 +182,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { << ", k:" << k << ", place:" << place << ", dtype:" << dtype; all_gather_calls.emplace_back([=] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( in_tensor_buf, gather_buff, 2 * k, static_cast(dtype), comm, stream)); }); diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h index 8bfea0f1ae8b8..5c3aef71ec40e 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/details/dgc_const_values.h" #include "paddle/fluid/framework/lod_tensor.h" #include 
"paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 600d75db53c7e..15acedf3cf50a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -54,7 +54,7 @@ class DeviceContext; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 5aef43263575e..739e05e1d7971 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -181,7 +181,7 @@ enum TableType { message TableParameter { optional uint64 table_id = 1; optional string table_class = 2; - optional uint64 shard_num = 3; + optional uint64 shard_num = 3 [ default = 1000 ]; optional TableType type = 4; optional TableAccessorParameter accessor = 5; } @@ -190,42 +190,73 @@ message TableAccessorParameter { optional string accessor_class = 1; optional SGDParameter embed_sgd_param = 2; optional SGDParameter embedx_sgd_param = 3; - optional uint32 fea_dim = 4; // for sparse table, this means field size of one - // value; for dense table, this means total value - // num - optional uint32 embedx_dim = 5; // embedx feature size - optional uint32 embedx_threshold = 6; // embedx feature create threshold + optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value + optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size + optional uint32 embedx_threshold = 6 + [ default = 10 ]; // embedx feature create threshold optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; } // TODO(guanqun): add NaiveSGD/Adam... 
message SGDParameter { optional string name = 1; - optional SGDRuleParameter adagrad = 2; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; } -message SGDRuleParameter { - optional double learning_rate = 1; - optional double initial_g2sum = 2; - optional double initial_range = 3 [ default = 0 ]; +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; repeated float weight_bounds = 4; } +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} + message CtrAccessorParameter { - optional float nonclk_coeff = 1; // to calculate show_click_score - optional float click_coeff = 2; // to calculate show_click_score - optional float base_threshold = - 3; // show_click_score > base_threshold, this feature can be saved - optional float delta_threshold = - 4; // delta_score > delta_threshold, this feature can be saved - optional float delta_keep_days = - 5; // unseen_day < delta_keep_days, this feature can be saved - optional float show_click_decay_rate = 6; // show/click will update to - // show/click * - // show_click_decay_rate after a day - optional float delete_threshold = 7; // threshold to shrink a feasign - optional float delete_after_unseen_days = 8; - optional int32 ssd_unseenday_threshold = 9; + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 + [ default = 0.98 ]; // show/click will update to + // show/click * + // show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 [ default = 30 ]; + optional int32 ssd_unseenday_threshold = 9 [ default = 1 ]; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; } message FsClientParameter { @@ -274,6 +305,7 @@ message DistributedStrategy { optional bool semi_auto = 35 [ default = false ]; optional bool adam_d2sum = 36 [ default = true ]; optional bool auto_search = 37 [ default = false ]; + optional bool heter_ccl_mode = 38 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc 
b/paddle/fluid/framework/dlpack_tensor_test.cc index 4e2d7bb979b61..9b8bdebe706eb 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -115,7 +115,7 @@ void TestMainLoop() { std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; - if (platform::GetCUDADeviceCount() > 1) { + if (platform::GetGPUDeviceCount() > 1) { places.emplace_back(platform::CUDAPlace(1)); } #else diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index f749ee8cfa0ba..82ce3b28776f1 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 37fbf47f854ad..8564a42165961 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -19,7 +19,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index c9b5abf7a9bef..c91d371f5a155 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -19,7 +19,7 @@ #include #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 645d725871a06..b043edca138a8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -40,7 +40,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" @@ -397,7 +397,7 @@ class BoxWrapper { if (nullptr != s_instance_) { VLOG(3) << "Begin InitializeGPU"; std::vector stream_list; - for (int i = 0; i < platform::GetCUDADeviceCount(); ++i) { + for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) { VLOG(3) << "before get context i[" << i << "]"; platform::CUDADeviceContext* context = dynamic_cast( @@ -416,7 +416,7 @@ class BoxWrapper { slot_name_omited_in_feedpass_.insert(slot_name); } slot_vector_ = slot_vector; - keys_tensor.resize(platform::GetCUDADeviceCount()); + keys_tensor.resize(platform::GetGPUDeviceCount()); } } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 66c043e137a24..225c2656fbfd1 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -740,10 +740,10 @@ void FleetWrapper::PushDenseVarsAsync( BOOST_GET_CONST(platform::CUDAPlace, place), g_data, sizeof(float) * count, stream); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); #endif diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 6fddedccf0258..deb2b90c93353 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -35,7 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 646a2e97d319f..e7f098320c6c7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index ec852ec83ca09..c293b07e8995c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -347,7 +347,7 @@ void HeterComm::build_ps(int num, KeyType* h_keys, gpuStream_t streams[stream_num]; for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i]))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType)); auto d_v_buf = memory::AllocShared(place, chunk_size * sizeof(ValType)); d_key_bufs.push_back(d_k_buf); @@ -360,11 +360,11 @@ void HeterComm::build_ps(int num, KeyType* h_keys, while (cur_len < len) { cur_stream = cur_stream % stream_num; int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, streams[cur_stream])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, streams[cur_stream])); @@ -378,7 +378,7 @@ void HeterComm::build_ps(int num, KeyType* h_keys, for (int i = 0; i < stream_num; ++i) { cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); } } @@ -402,14 +402,14 @@ void HeterComm::merge_grad(int gpu_num, GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); void* d_buff = NULL; auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); temp_storage_bytes = 0; @@ -417,7 +417,7 @@ void HeterComm::merge_grad(int gpu_num, auto d_num_runs_out_mem = memory::AllocShared(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); @@ -426,13 +426,13 @@ void HeterComm::merge_grad(int gpu_num, d_temp_storage = memory::AllocShared(place, temp_storage_bytes); } - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); 
cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } template @@ -461,12 +461,12 @@ void HeterComm::split_input_to_shard( size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); calc_shard_offset<<>>(d_shard_index_ptr, @@ -720,12 +720,12 @@ int HeterComm::gather_one_node_grad( cudaMemcpyHostToDevice); // allgather grad len - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, ncclInt, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, cudaMemcpyDeviceToHost); @@ -737,15 +737,15 @@ int HeterComm::gather_one_node_grad( storage.alloc(max_size * total_gpu); // allgather keys and grads - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int h_left[total_gpu]; int h_right[total_gpu]; @@ -802,11 +802,11 @@ int HeterComm::gather_multi_node_grad( cudaMemcpy(d_node_len, h_node_len, sizeof(int), cudaMemcpyHostToDevice); // allgather grad len - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_node_len, d_node_len, 1, ncclInt, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * 
node_size_, cudaMemcpyDeviceToHost); @@ -818,15 +818,15 @@ int HeterComm::gather_multi_node_grad( storage.alloc(max_size * node_size_); // allgather keys and grads - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int merge_num = 0; for (int i = 0; i < node_size_; ++i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index a369a612d4935..ccdb6c5cdd64e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -30,11 +30,11 @@ GPUResource::GPUResource(std::vector& dev_ids, int index) { remote_streams_.resize(dev_ids_.size()); for (size_t i = 0; i < dev_ids_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&local_streams_[i], cudaStreamNonBlocking)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&comm_streams_[i], cudaStreamNonBlocking)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&remote_streams_[i], cudaStreamNonBlocking)); } } @@ -42,13 +42,13 @@ GPUResource::GPUResource(std::vector& dev_ids, int index) { GPUResource::~GPUResource() { platform::CUDADeviceGuard guard(dev_id_); for (size_t i = 0; i < local_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(local_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(local_streams_[i])); } for (size_t i = 0; i < comm_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(comm_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(comm_streams_[i])); } for (size_t i = 0; i < remote_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(remote_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(remote_streams_[i])); } } @@ -58,7 +58,7 @@ void HeterPsResource::enable_p2p() { for (size_t j = 0; j < dev_ids_.size(); ++j) { if (i != j) { int p2p_flag; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceCanAccessPeer(&p2p_flag, dev_ids_[i], dev_ids_[j])); if (p2p_flag == 1) { cudaError_t ret = cudaDeviceEnablePeerAccess(dev_ids_[j], 0); diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 3ac95632de6bf..cbd06deeafc75 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -22,7 +22,7 @@ bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitRank( &(nccl_info_.comm_), nccl_info_.global_ranks_, 
nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); #endif @@ -38,7 +38,7 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { NCCLInfo NCCLWrapper::GetNCCLId() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; @@ -52,9 +52,9 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.global_ranks_ = ranks; platform::SetDeviceId(local_rank); #ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif #endif return; @@ -67,7 +67,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, auto var = scope.FindVar(name); LoDTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); #ifdef PADDLE_WITH_RCCL diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 6519a514ff3b6..a0954ef0709dc 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b726a629586e1..c163c2de11019 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -37,8 +37,8 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE @@ -230,7 +230,7 @@ class PSGPUWrapper { ? 
1.0 : config["mf_max_bound"]; for (size_t i = 0; i < heter_devices_.size(); i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(heter_devices_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, initial_range); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9ab6b5d8c178b..06d1ef84c1955 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -53,6 +53,15 @@ void XPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif +#ifdef PADDLE_WITH_IPU +IPUGarbageCollector::IPUGarbageCollector(const platform::IPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} +void IPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) @@ -83,9 +92,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( new platform::StreamCallbackManager(stream_)); #endif @@ -94,13 +103,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, StreamGarbageCollector::~StreamGarbageCollector() { auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); -#endif + platform::GpuStreamSync(stream_); + platform::GpuDestroyStream(stream_); } gpuStream_t StreamGarbageCollector::stream() const { return stream_; } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2c2b57bbe420a..0cfeda37c222e 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -80,6 +80,16 @@ class XPUGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_IPU +class IPUGarbageCollector : public GarbageCollector { + public: + IPUGarbageCollector(const platform::IPUPlace &place, size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 154154fc79517..a020bda823167 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace framework { @@ -33,7 +33,7 @@ const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { static std::vector> default_cuda_generators; std::call_once(num_devices_init_flag, []() { - num_cuda_devices = paddle::platform::GetCUDADeviceCount(); + num_cuda_devices = paddle::platform::GetGPUDeviceCount(); cuda_device_flags.resize(num_cuda_devices); default_cuda_generators.resize(num_cuda_devices); }); diff --git a/paddle/fluid/framework/heter_pipeline_trainer_test.cc b/paddle/fluid/framework/heter_pipeline_trainer_test.cc index af8eca32ee2f4..417c7685bcbeb 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer_test.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer_test.cc @@ -115,8 +115,6 @@ TEST(HeterPipelineTrainerTest, GPU) { t3.add_trainers(1); t3.add_trainers(1); t3.add_trainers(1); - t3.add_dump_fields("hello"); - t3.add_dump_param("fc_0"); auto* heter_section_param3 = t3.mutable_heter_section_param(); heter_section_param3->set_num_pipeline_stages(3); heter_section_param3->set_pipeline_stage(2); diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index a8db38f8077dd..69a4a180a9071 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -277,7 +277,7 @@ void HeterSectionWorker::CopyParameters(int microbatch_id, void HeterSectionWorker::Run() { if (debug_) { size_t total_ops_size = forward_ops_.size() + backward_ops_.size(); - op_name_.resize(total_ops_size); + op_name_.reserve(total_ops_size); op_total_time_.resize(total_ops_size); platform::SetNumThreads(1); // forward op + backward op diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 8049a1c9424be..93b7869cc1d25 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -51,11 +51,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); cudaStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); copy_streams_.push_back(stream); places_.push_back(place); cudaEvent_t event; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); events_.push_back(event); #endif @@ -104,7 +104,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, // platform::CUDAPlace place = platform::CUDAPlace(num); // platform::CUDADeviceGuard guard(place.device); // cudaStream_t stream; - // PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + // PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); // copy_streams_.push_back(stream); // places_.push_back(place); // } @@ -157,7 +157,7 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { } } #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); #endif } @@ -287,7 +287,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { #ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); #endif object_pool_.Push(context); @@ -441,7 +441,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); #endif } @@ -461,7 +461,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #endif } #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, copy_streams_[context->place_num_])); while (cudaEventQuery(context->event_) != cudaSuccess) { VLOG(3) << "wait for kernel"; @@ -481,7 +481,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #ifdef PADDLE_WITH_CUDA auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, dev_ctx->stream())); // cudaEventSynchronize(context->event_); { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e384cb4633794..b98a228868266 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -161,7 +161,7 @@ cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DE cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) cc_test(test_adaptive_pool2d_convert_global_pass SRCS adaptive_pool2d_convert_global_pass_tester.cc DEPS adaptive_pool2d_convert_global_pass) -cc_test(test_unsqueeze2_eltwise_fuse_pass SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) +cc_test(test_unsqueeze2_eltwise_fuse_pass_cc SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) cc_test(test_layer_norm_fuse_pass_cc SRCS layer_norm_fuse_pass_tester.cc DEPS layer_norm_fuse_pass pass_test_util naive_executor) cc_test(test_generate_pass_cc SRCS generate_pass_tester.cc DEPS generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index bb78cdab67752..e246a10961c0c 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -130,6 +130,32 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + + // Only support 2D-Tensor as weight for FC + std::vector w_shape = w->Var()->GetShape(); + size_t w_rank = w_shape.size(); + if (w_rank != 2) return; + + // axis of elementwise_add should be -1 or x_num_col_dims + auto x_num_col_dims = + BOOST_GET_CONST(int, mul->Op()->GetAttr("x_num_col_dims")); + auto axis = BOOST_GET_CONST(int, elementwise_add->Op()->GetAttr("axis")); + if (axis != -1 && axis != x_num_col_dims) return; + + // Shape of bias should be [1, out_size] or [out_size] + std::vector b_shape = bias->Var()->GetShape(); + if (b_shape.size() == 1) { + if (b_shape[0] != w_shape[1]) { + return; + } + } else if (b_shape.size() == 2) { + if (b_shape[0] != 1 || 
b_shape[1] != w_shape[1]) { + return; + } + } else { + return; + } + Node* relu = nullptr; Node* relu_out = nullptr; if (with_relu) { diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 5046911036818..39b544e716079 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -55,14 +55,14 @@ TEST(FCFusePass, basic) { auto* bias_0 = layers.data("conv2d_bias_0", {}, true); auto* conv2d_out = layers.conv2d(a, filters_0, bias_0, false); auto* relu_out_0 = layers.relu(conv2d_out); - auto* weights_0 = layers.data("weights_0", {}, true); + auto* weights_0 = layers.data("weights_0", {5, 4}, true); auto* mul_out_0 = layers.mul(relu_out_0, weights_0); - auto* bias_1 = layers.data("bias_1", {}, true); + auto* bias_1 = layers.data("bias_1", {4}, true); auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); auto* relu_out_1 = layers.relu(add_out_0); - auto* weights_1 = layers.data("weights_1", {}, true); + auto* weights_1 = layers.data("weights_1", {8, 9}, true); auto* mul_out_1 = layers.mul(relu_out_1, weights_1); - auto* bias_2 = layers.data("bias_2", {}, true); + auto* bias_2 = layers.data("bias_2", {1, 9}, true); auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); VLOG(4) << add_out_1; diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index ae662c64af331..f12273e94dddd 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -24,12 +24,8 @@ class Node; } // namespace ir } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index ec014d331fa44..005f006ab0478 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -15,13 +15,8 @@ #include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h" #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index dd0ffe8b9fd0d..5334b08248992 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1619,6 +1619,26 @@ PDNode *patterns::Reshape::operator()() { return reshape_out; } +PDNode *patterns::Slice::operator()() { + auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + + auto slice_op = pattern->NewNode(slice_op_repr())->assert_is_op("slice"); + + auto slice_in = pattern->NewNode(slice_in_repr()) + ->AsInput() + ->assert_is_op_input("slice", "Input"); + auto slice_out = pattern->NewNode(slice_out_repr()) + ->AsOutput() + ->assert_is_op_output("slice", "Out"); + + auto next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + prev_op->LinksTo({slice_in}); + 
slice_op->LinksFrom({slice_in}).LinksTo({slice_out}); + next_op->LinksFrom({slice_out}); + return slice_out; +} + PDNode *patterns::Matmul::operator()() { auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul"); @@ -2315,7 +2335,7 @@ PDNode *patterns::QuantizePlacement::operator()( std::unordered_set({"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", "prior_box", "reshape2", "transpose2", "fusion_gru", - "fusion_lstm", "multi_gru"}); + "fusion_lstm", "multi_gru", "slice"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d7bfdc57d1c7e..fa8504d074a88 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -980,6 +980,20 @@ struct Reshape : public PatternBase { PATTERN_DECL_NODE(reshape_out); PATTERN_DECL_NODE(next_op); }; +// Slice op +// Forward pass for slice. +// slice_out is a result of the operator. +struct Slice : public PatternBase { + Slice(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "slice") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(slice_in); + PATTERN_DECL_NODE(slice_op); + PATTERN_DECL_NODE(slice_out); + PATTERN_DECL_NODE(next_op); +}; // Matmul op // Forward pass for matmul. diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index bf7cd55fab268..1ca6e989f275c 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -283,7 +283,8 @@ void BufferSharedInplaceOpPass::ApplyImpl(ProgramDesc *main_program, op->SetInput("X", inputs); op->SetOutput("Out", outputs); op->SetOutput("XOut", inputs); // add necessary dependency - op->SetAttr("share_dims", std::vector(inputs.size(), false)); + op->SetAttr("share_dims_and_dtype", + std::vector(inputs.size(), false)); } block->Flush(); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index d09de5be84c35..0ed2ec51b89cb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -277,7 +277,7 @@ static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, grad_add_op_desc->SetInput("X", {in_var_1->Name()}); grad_add_op_desc->SetOutput("Out", {out_var->Name()}); grad_add_op_desc->SetOutput("XOut", {in_var_1->Name()}); - grad_add_op_desc->SetAttr("share_dims", std::vector(1, true)); + grad_add_op_desc->SetAttr("share_dims_and_dtype", std::vector(1, true)); // Add share_buffer op between in_var_0 and in_var_1 OpDesc share_buffer_op; @@ -285,7 +285,7 @@ static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, share_buffer_op.SetInput("X", {in_var_0->Name()}); share_buffer_op.SetOutput("Out", {in_var_1->Name()}); share_buffer_op.SetOutput("XOut", {in_var_0->Name()}); - share_buffer_op.SetAttr("share_dims", std::vector(1, false)); + share_buffer_op.SetAttr("share_dims_and_dtype", std::vector(1, false)); auto *new_share_buffer_op = graph->CreateOpNode(&share_buffer_op); new_share_buffer_op->inputs.push_back(in_var_0); diff --git 
a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index f6465d385841d..9d1e2301704b3 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -329,7 +329,7 @@ bool MemoryReusePass::IsVarPairReusable( void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, details::VarHandle *out_var, - bool share_dims) const { + bool share_dims_and_dtype) const { PADDLE_ENFORCE_GT( (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, platform::errors::NotFound("Var(%s) does not in mem opt var infos.", @@ -349,8 +349,8 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, share_buffer_op->AddInput(in_var); } - if (share_dims) { - share_buffer_op->SetShareDims(true); + if (share_dims_and_dtype) { + share_buffer_op->SetShareDimsAndDtype(true); } share_buffer_op->AddReuseVarPair( diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 2bf8a3b64f0a7..3df4a84470524 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -676,6 +676,57 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const { PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count); } +void CPUQuantizePass::QuantizeSlice(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Slice slice_pattern{pattern, name_scope_}; + slice_pattern(); + + int quantize_slice_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize slice op"; + GET_IR_NODE_FROM_SUBGRAPH(slice_op, slice_op, slice_pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(slice_op->Op())) { + LogQuantizationDisabled(slice_op); + return; + } + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, slice_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, slice_pattern); + + // skip if prev op and next op is not quantized + if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) { + return; + } + GET_IR_NODE_FROM_SUBGRAPH(slice_in, slice_in, slice_pattern); + GET_IR_NODE_FROM_SUBGRAPH(slice_out, slice_out, slice_pattern); + + if (!AreScalesPresentForNodes({slice_out})) { + LogCannotQuantizeOp(slice_op); + return; + } + + bool is_input_unsigned{false}; + auto input_scale = GetScaleValueForNode(slice_out, &is_input_unsigned); + QuantizeInput(g, slice_op, slice_in, "Input", input_scale, + is_input_unsigned); + + bool is_output_unsigned{false}; + auto output_scale = GetScaleValueForNode(slice_out, &is_output_unsigned); + DequantizeOutput(g, slice_op, slice_out, "Out", output_scale, + is_output_unsigned); + + ++quantize_slice_count; + }; + + gpd(graph, handler); + AddStatis(quantize_slice_count); + + PrettyLogDetail("--- quantized %d slice ops", quantize_slice_count); +} + void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -1024,6 +1075,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); + QuantizeSlice(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 18735633c0d69..b3ee98263c0c0 
100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -61,6 +61,7 @@ class CPUQuantizePass : public FusePassBase { void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; + void QuantizeSlice(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, bool is_input_unsigned, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index b6a8de263aa2a..838912f659ff7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -55,6 +55,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } else if (type == "slice") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "dropout") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); @@ -784,6 +788,113 @@ TEST(CpuQuantizePass, reshapeBetweenNonQuantizedOp) { added_nodes_count, 2.0f * 127); } +static const std::initializer_list variable_names_slice = { + "a", "b", "c", "d"}; + +// a->Dequantize->b +// b->Slice->c +// c->Dropout->d +ProgramDesc BuildProgramDescSlice() { + ProgramDesc prog; + for (auto& v : variable_names_slice) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); + SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + return prog; +} + +// a->Transpose->b +// b->slice->c +// c->Dropout->d +ProgramDesc BuildProgramDescSliceBetweenNonQuantizedOp() { + ProgramDesc prog; + for (auto& v : variable_names_slice) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32"); + SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + return prog; +} + +void MainTestSlice(const ProgramDesc& prog, int transpose_count, + int slice_count, int quant_count, int dequant_count, + int added_nodes_count, float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names_slice, &original_nodes_num, + ¤t_nodes_num); + + float quant_scale = 1.0f; + float dequant_scale = 1.0f; + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int transpose_nodes_count = 0; + int slice_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "transpose2") { + transpose_nodes_count++; + } else if (op->Type() == "slice") { + slice_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + quant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); + EXPECT_EQ(quant_scale, scale) << "Scale for node '" + op->Type() + "'."; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + auto op_name = op->GetAttrIfExists("name"); + VLOG(3) << op_name << "\n"; + if (op_name != "Dequantize1") { + dequant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); + EXPECT_EQ(dequant_scale, scale) + << "Scale 
for node '" + op->Type() + "'."; + } + } + } + } + EXPECT_EQ(transpose_nodes_count, transpose_count); + EXPECT_EQ(slice_nodes_count, slice_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, slice) { + // a->Dequantize->b + // b2->Quant->b3->slice->c1->Dequant->c2 + // c2->Dropout->d + int slice_count = 1; + int transpose_count = 0; + int quant_count = 1; + int dequant_count = 2; + // 1 Quant + 1 IN + 1 DeQuant + 1 OUT + int added_nodes_count = 4; + MainTestSlice(BuildProgramDescSlice(), transpose_count, slice_count, + quant_count, dequant_count, added_nodes_count, 2.0f * 127); +} + +TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { + // a->Transpos2->b + // b->slice->c + // c->Dropout->d + int slice_count = 1; + int transpose_count = 1; + int quant_count = 0; + int dequant_count = 0; + // 0 Quant + 0 IN + 0 DeQuant + 0 OUT + int added_nodes_count = 0; + MainTestSlice(BuildProgramDescSliceBetweenNonQuantizedOp(), transpose_count, + slice_count, quant_count, dequant_count, added_nodes_count, + 2.0f * 127); +} + static const std::initializer_list variable_names_matmul = { "a", "b", "c", "d", "e", "f"}; diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc index b2b1a7515f0a5..2d60129165a60 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -231,3 +232,7 @@ void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op, Node* old_var, REGISTER_PASS(simplify_with_basic_ops_pass, paddle::framework::ir::SimplifyWithBasicOpsPass); +REGISTER_PASS_CAPABILITY(simplify_with_basic_ops_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "scale", 0)); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 8fe314cf5f18c..f7539aa485957 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("IPU")) { + return LibraryType::kPlain; } else if (s == std::string("NPU")) { return LibraryType::kPlain; } else if (s == std::string("CUDA")) { @@ -68,7 +70,7 @@ inline LibraryType StringToLibraryType(const char* ctype) { } else { PADDLE_THROW(platform::errors::Unimplemented( "Unknown LibraryType string (%s), only support library type string " - "include PLAIN, MKLDNN, CUDNN, CPU and CUDA.", + "include PLAIN, MKLDNN, CUDNN, CPU, CUDA and IPU.", s.c_str())); } } diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 8fb59d682e40f..10e7ed0fb6021 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/platform/gpu_info.h" +#include 
"paddle/fluid/platform/device/gpu/gpu_info.h" template using vec = paddle::framework::Vector; @@ -63,7 +63,7 @@ TEST(mixed_vector, GPU_VECTOR) { } TEST(mixed_vector, MultiGPU) { - if (paddle::platform::GetCUDADeviceCount() < 2) { + if (paddle::platform::GetGPUDeviceCount() < 2) { LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " "GPUs in your machine."; return; diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 15e6b2a1ff939..064dfa0170bdb 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -137,7 +137,7 @@ std::shared_ptr TransferLayout(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); + auto* ptr = local_scope->Var(*new_var_name); auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); @@ -171,8 +171,8 @@ std::shared_ptr TransferDtype(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); - var_scope->SetVarDesc(var_name, nullptr); + auto* ptr = local_scope->Var(*new_var_name); + auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); @@ -211,7 +211,7 @@ std::shared_ptr TransferDevice(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_device_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); + auto* ptr = local_scope->Var(*new_var_name); auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 94b2118ba9d73..dcbdd12f88fb7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -398,13 +398,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << op->Type() - << "): context wait and get last error"; -#endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; #endif @@ -439,6 +434,7 @@ void InterpreterCore::ExecuteInstructionList( if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught " << exception_holder_.Type(); + async_work_queue_->Cancel(); exception_holder_.ReThrow(); } @@ -514,7 +510,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); auto* op = instr_node.OpBase(); - platform::RecordEvent instruction_event(op->Type()); + platform::RecordEvent instruction_event(op->Type().c_str()); interpreter::WaitEvent(instr_node, place_); try { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc 
index 0501522a7a810..3817a11b9afe4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -20,9 +20,26 @@ #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +PADDLE_DEFINE_EXPORTED_bool( + new_executor_sequential_run, false, + "Enable sequential execution for standalone executor, used for debug"); namespace paddle { namespace framework { namespace interpreter { + +void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, + std::function fn) { + // NOTE(zhiqiu): use the second queue, so only one thread is used. + if (FLAGS_new_executor_sequential_run) { + VLOG(4) << "FLAGS_new_executor_sequential_run:" + << FLAGS_new_executor_sequential_run; + queue_group_->AddTask(static_cast(OpFuncType::kQueueAsync), + std::move(fn)); + } else { + queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); + } +} + using VariableIdMap = std::map>; AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps( diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index c92cea6c97c86..8f27c7e1811fb 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -77,9 +77,7 @@ class AsyncWorkQueue { // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } - void AddTask(const OpFuncType& op_func_type, std::function fn) { - queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); - } + void AddTask(const OpFuncType& op_func_type, std::function fn); void Cancel() { queue_group_->Cancel(); } diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 51c9e3d66a6f0..8df8db35592bb 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/timer.h" namespace paddle { @@ -45,7 +45,7 @@ class ProfilerGuard { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); cost_info_->device_memory_bytes = - platform::RecordedCudaMallocSize(cuda_place.device); + platform::RecordedGpuMallocSize(cuda_place.device); #endif } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6ef44fb127afb..4236fcf8dc134 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } BuildPtenKernelContext(*runtime_ctx, dev_ctx); (*pt_kernel_)(pt_kernel_context_.get()); - WriteBackToOutputs(runtime_ctx); - pt_kernel_context_->ClearData(); } else { (*kernel_func_)( @@ -1214,14 +1212,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, /*For profiling/benchmark only*/ if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); #endif -#if defined(PADDLE_WITH_HIP) - 
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; -#endif } if (FLAGS_check_nan_inf) { @@ -1814,45 +1808,31 @@ void OperatorWithKernel::BuildPtenKernelContext( size_t start_idx = (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (pt_kernel_context_->InputsSize() == start_idx) { - paddle::SmallVector> tmp_inputs; - for (auto* var : ins_vector) { - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(*var, in_def)); - } - pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs)); - } else if (pt_kernel_context_->InputsSize() > start_idx) { - size_t input_size = pt_kernel_context_->InputsSize(); - for (size_t j = 0; j < ins_vector.size(); ++j) { - if (input_size > start_idx + j) { + auto current_vector_size = pt_kernel_context_->InputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + auto& input_ptr = + pt_kernel_context_->MutableInputPtrAt(start_idx + offset); + if (input_ptr == nullptr) { + input_ptr = experimental::MakePtenTensorBaseFromVar( + *ins_vector[offset], in_def); + } else { experimental::ReMakePtenDenseTensorFromVar( - *ins_vector[j], in_def, + *ins_vector[offset], in_def, pt_kernel_context_->MutableInputAt(start_idx + - j)); - // TODO(chentianyu03): When multi input kernel, open this code - /* - } else { - pt_kernel_context_->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(*ins_vector[j], - in_def)); - */ + offset)); } + } else { + pt_kernel_context_->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], + in_def)); } - pt_kernel_context_->MutableInputRangeAt(i) = - std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.inputs.size() is " - "`%d`.", - start_idx, pt_kernel_context_->InputsSize())); } + pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -1862,46 +1842,25 @@ void OperatorWithKernel::BuildPtenKernelContext( size_t start_idx = (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. 
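The rewritten input loop in BuildPtenKernelContext above makes a per-slot decision: when the kernel context already owns an entry at start_idx + offset it is reused (a null slot is filled, a live one is re-made from the variable), and only when the flat vector is too short is new storage appended; the (start, end) range for argument i is then recorded via AssignInputRange. A self-contained sketch of that reuse-or-append pattern follows; the types and function are illustrative stand-ins, not the pten KernelContext API.

#include <cstddef>
#include <memory>
#include <vector>

struct ToyTensor { int value = 0; };

// Rebuild the slots for one argument from `vars`, reusing entries that already
// exist in the flat vector and appending fresh storage otherwise.
void RebuildRange(std::vector<std::unique_ptr<ToyTensor>>* slots,
                  const std::vector<int>& vars, std::size_t start_idx) {
  const std::size_t current_vector_size = slots->size();
  for (std::size_t offset = 0; offset < vars.size(); ++offset) {
    if (current_vector_size > start_idx + offset) {
      auto& slot = (*slots)[start_idx + offset];
      if (slot == nullptr) slot = std::make_unique<ToyTensor>();
      slot->value = vars[offset];  // "re-make" the existing tensor in place
    } else {
      auto fresh = std::make_unique<ToyTensor>();
      fresh->value = vars[offset];  // fresh storage appended at the back
      slots->push_back(std::move(fresh));
    }
  }
}

int main() {
  std::vector<std::unique_ptr<ToyTensor>> slots;
  RebuildRange(&slots, {1, 2, 3}, 0);  // first run: everything appended
  RebuildRange(&slots, {4, 5, 6}, 0);  // second run: all three slots reused
  return 0;
}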
- if (pt_kernel_context_->OutputsSize() == start_idx) { - paddle::SmallVector> tmp_outputs; - for (auto* var : outs_vector) { - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(var, out_def)); - } - pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs)); - } else if (pt_kernel_context_->OutputsSize() > start_idx) { - size_t output_size = pt_kernel_context_->OutputsSize(); - for (size_t j = 0; j < outs_vector.size(); ++j) { - if (output_size > start_idx + j) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[j], out_def, - pt_kernel_context_->MutableOutputAt(start_idx + - j)); - - // TODO(chentianyu03): When multi output kernel, open this code - /* - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(outs_vector[j], - out_def)); - */ - } + auto current_vector_size = pt_kernel_context_->OutputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset], out_def, + pt_kernel_context_->MutableOutputAt(start_idx + + offset)); + } else { + pt_kernel_context_->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(outs_vector[offset], + out_def)); } - pt_kernel_context_->MutableOutputRangeAt(i) = - std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.outputs.size() is " - "`%d`.", - start_idx, pt_kernel_context_->OutputsSize())); } + pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx), + i); } for (size_t i = 0; i < attr_names.size(); ++i) { diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index b80a265f8a41b..b13166cff60aa 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -4,20 +4,22 @@ cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn) cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) -cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) -set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") +if (WITH_TESTING) + cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) + set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) -set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) + set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) -set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) + set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(transform_desc_test SRCS 
transform_desc_test.cc DEPS transform_desc) -set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) + set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) -set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) + set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) -set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) + set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") +endif() diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 3f1b6c78d8417..7fc8eff3d31c9 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -193,6 +193,8 @@ std::unique_ptr CinnCompiler::CompileGraph( CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors}; auto frontend_program = symbol(); ProgramPass::Apply(&frontend_program, target, {"Decomposer"}); + auto fetch_ids = symbol.GetFetchIds(); + ::cinn::frontend::ApplyPass(&frontend_program, fetch_ids, "RemoveIdentity"); auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>( frontend_program, target); VLOG(1) << "-- The " << compiled_num << "-th compilation (" @@ -201,7 +203,6 @@ std::unique_ptr CinnCompiler::CompileGraph( ApplyPass(cinn_graph.get(), "OpFusion"); auto scope = BuildScope(target, cinn_graph); - auto fetch_ids = symbol.GetFetchIds(); VLOG(4) << "All fetch var ids in CINN: " << string::join_strings(fetch_ids, ','); @@ -209,6 +210,7 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; + options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 78774f0489638..18d0ee78ffbbc 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -34,7 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index b423d0e05e174..51a2d641bb00a 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -136,17 +136,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { auto& in = op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(1) << "Parse PtenKernel input: skip extra & quant input - " + VLOG(3) << "Parse PtenKernel input: skip extra & quant input - " << in_name; continue; } // If contains dispensable input, we should override the // GetExpectedPtenKernelArgs method self if (in.has_dispensable() && in.dispensable()) { - VLOG(1) << "Parse PtenKernel input: skip dispensable input - " << in_name; + VLOG(3) << "Parse PtenKernel input: skip dispensable input - " << in_name; continue; } - VLOG(1) << "Parse PtenKernel input: " << in_name; + VLOG(3) << "Parse PtenKernel input: " << in_name; input_names_.emplace_back(in_name); } return input_names_; @@ -158,7 +158,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); // TODO(chenweihang): outputs also need skip some cases - VLOG(1) << "Parse PtenKernel output: " << out_name; + VLOG(3) << "Parse PtenKernel output: " << out_name; output_names_.emplace_back(out_name); } return output_names_; @@ -172,17 +172,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { if (attr_name == "use_mkldnn" || attr_name == "op_role" || attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(1) << "Parse PtenKernel attribute: skip needless attr - " + VLOG(3) << "Parse PtenKernel attribute: skip needless attr - " << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(1) << "Parse PtenKernel attribute: skip extra & quant attr - " + VLOG(3) << "Parse PtenKernel attribute: skip extra & quant attr - " << attr_name; continue; } - VLOG(1) << "Parse PtenKernel attribute: " << attr_name; + VLOG(3) << "Parse PtenKernel attribute: " << attr_name; attr_names_.emplace_back(attr_name); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 8d927b87c9abe..cbbc020989d1e 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +DECLARE_bool(use_stream_safe_cuda_allocator); + namespace paddle { namespace memory { namespace allocation { @@ -60,14 +62,7 @@ void* Tensor::mutable_data(const platform::Place& place, "The Tensor's shape is [", dims(), "] now")); size_t size = numel() * SizeOfType(type); - if (requested_size) { - PADDLE_ENFORCE_GE( - requested_size, size, - platform::errors::InvalidArgument( - "The requested memory size is less than the memory size of Tensor. 
" - "But received requested memory size is %d, " - "memory size of Tensor is %d.", - requested_size, size)); + if (requested_size && (requested_size > size)) { size = requested_size; } /* some versions of boost::variant don't have operator!= */ @@ -89,6 +84,35 @@ void* Tensor::mutable_data(const platform::Place& place, return mutable_data(place, type_, requested_size); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void* Tensor::mutable_data(const platform::CUDAPlace& place, + proto::VarType::Type type, + const gpuStream_t& stream) { + if (!FLAGS_use_stream_safe_cuda_allocator) { + return mutable_data(place, type); + } + + type_ = type; + PADDLE_ENFORCE_GE( + numel(), 0, + platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), "] now")); + size_t size = numel() * SizeOfType(type); + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + holder_.reset(); + holder_ = memory::AllocShared(place, size, stream); + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} +#endif + Tensor& Tensor::ShareDataWith(const Tensor& src) { src.check_memory_size(); *this = src; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 7f8d7bffa986e..4b1ae041fc4ca 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -81,6 +81,7 @@ class TensorInplaceVersion { bool IsUnique() const { return inplace_version_ == 0; } void Bump() { ++inplace_version_; } uint32_t CurrentVersion() const { return inplace_version_; } + void SetInplaceVersionToZero() { inplace_version_ = 0; } private: uint32_t inplace_version_; @@ -149,6 +150,11 @@ class Tensor { void* mutable_data(const platform::Place& place, size_t requested_size = 0); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void* mutable_data(const platform::CUDAPlace& place, + proto::VarType::Type type, const gpuStream_t& stream); +#endif + /** * @brief Return a pointer to mutable memory block. * @@ -260,6 +266,8 @@ class Tensor { // should not be copied. 
} + void ShareDataTypeWith(const Tensor& tensor) { type_ = tensor.type_; } + bool IsSharedBufferWith(const Tensor& src) const { return holder_ && holder_ == src.Holder(); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 1d5e638729361..eb8a1e4cea9fb 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -22,7 +22,7 @@ #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include #include "paddle/fluid/operators/conv_cudnn_op_cache.h" @@ -30,8 +30,8 @@ #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/nccl_helper.h" // NOLINT +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index ae7ae85207d84..9a9b90cd81179 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -23,15 +23,15 @@ #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/nccl_helper.h" // NOLINT +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 792a2accd41d6..f8ad990a668ce 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -75,6 +75,7 @@ class Variable { framework::TensorInplaceVersion* InplaceVersionCounter(); public: + void SetInplaceVersionToZero(); uint32_t CurrentInplaceVersion(); void BumpInplaceVersion(); @@ -134,6 +135,12 @@ inline framework::TensorInplaceVersion* Variable::InplaceVersionCounter() { return version_counter_ptr; } +inline void Variable::SetInplaceVersionToZero() { + auto inplace_version_counter = this->InplaceVersionCounter(); + if (inplace_version_counter) + inplace_version_counter->SetInplaceVersionToZero(); +} + inline uint32_t Variable::CurrentInplaceVersion() { auto version_counter_ptr = InplaceVersionCounter(); if (version_counter_ptr) { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 9121610d29eaa..594b0d48a8aad 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -30,6 +30,9 @@ if(NOT WIN32) cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(reducer SRCS reducer.cc DEPS layer) endif() + if(WITH_NCCL OR WITH_RCCL 
OR WITH_XPU_BKCL OR WITH_ASCEND_CL) + cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) + endif() cc_library(data_loader SRCS data_loader.cc DEPS enforce) endif(NOT WIN32) if(WITH_GLOO) diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index b922811b4f104..31da214fbc39a 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -28,8 +28,8 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { @@ -64,7 +64,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto nccl_dtype = platform::ToNCCLDataType(src.type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( src_ptr, dst_ptr, src.numel(), nccl_dtype, ncclSum, comm->comm(), stream)); } @@ -100,16 +100,12 @@ static void AllReduce(const framework::SelectedRows &src, if (!use_calc_stream) { dev_ctx->Wait(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( gpu_rows_num_ptr + strategy.local_rank_, gpu_rows_num_ptr, 1, ncclInt64, comm->comm(), stream)); if (!use_calc_stream) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } const auto *cpu_rows_num_ptr = rows_num_vector.data(); @@ -146,11 +142,11 @@ static void AllReduce(const framework::SelectedRows &src, // allgather is used to speed up the allreduce by replacing broadcast. auto row_sendcount = cpu_rows_num_ptr[0]; VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_rows_ptr, dst_rows_ptr, row_sendcount, ncclInt64, comm->comm(), stream)); auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, comm->comm(), stream)); return; @@ -158,13 +154,13 @@ static void AllReduce(const framework::SelectedRows &src, for (int i = 0; i < strategy.nranks_; ++i) { if (cpu_rows_num_ptr[i] > 0) { // 2. Broadcast the rows of SelectedRows - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBroadcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], ncclInt64, i, comm->comm(), stream)); // 3. 
Broadcast the tensor data of SelectedRows auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + row_offset * feature_size * sizeof_dtype; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBroadcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(), stream)); row_offset += cpu_rows_num_ptr[i]; @@ -209,12 +205,8 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), tmp_dst.GetMutable(), strategy, stream, comm); -// stream must synchronize to ensure accuracy of the move operation -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + // stream must synchronize to ensure accuracy of the move operation + platform::GpuStreamSync(stream); *dst = std::move(tmp_dst); } #endif diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 8c6b840f60a59..6569929d6f5d7 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -150,6 +150,23 @@ void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, } } +void BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::BKCLComm *comm = + platform::BKCLCommContext::Instance().Get(ring_id, place); + XPUStream stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto data_type = platform::ToBKCLDataType(src_tensor->type()); + + PADDLE_ENFORCE_EQ(bkcl_broadcast(comm->comm(), src_ptr, src_ptr, + src_tensor->numel(), data_type, 0, stream), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_broadcast failed")); +} + paddle::platform::DeviceContext *BKCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index 652b7689666c6..a5a10b19389c0 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -42,6 +42,8 @@ class BKCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index ef1bf0d158787..1eaf0c6538043 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -37,7 +37,7 @@ void GLOOParallelContext::Init() { gloo_wrapper->SetSize(strategy_.nranks_); gloo_wrapper->SetRank(strategy_.local_rank_); gloo_wrapper->SetPrefix(""); - gloo_wrapper->SetIface("lo"); + gloo_wrapper->SetIface(""); auto addr = paddle::string::Split(strategy_.trainer_endpoints_[0], ':'); VLOG(4) << "Server is" << strategy_.trainer_endpoints_[0]; std::string host = addr[0]; @@ -176,6 +176,11 @@ void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, } } +void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented inter-broadcast for CPU now.")); +} + paddle::platform::DeviceContext 
*GLOOParallelContext::GetDeviceContext( int ring_id) { // return the CPUDeviceContext diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 305a75a881153..e7c9ba4cfddb6 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -47,6 +47,8 @@ class GLOOParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 4f1135fa9ddd4..55c52ae6c11de 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -158,6 +158,29 @@ void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, } } +void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + if (src->IsType()) { + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::HCCLComm *comm = + platform::HCCLCommContext::Instance().Get(ring_id, place); + aclrtStream stream = comm->stream(); + + void *src_ptr = + reinterpret_cast(const_cast(src_tensor->data())); + auto hccl_dtype = platform::ToHCCLDataType(src_tensor->type()); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + src_ptr, src_tensor->numel(), hccl_dtype, 0, comm->comm(), + reinterpret_cast(stream))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor is supported.", + platform::demangle(framework::ToTypeName(src->Type())))); + } +} + paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/hccl_context.h b/paddle/fluid/imperative/hccl_context.h index b7f22f3a0b0f1..e5f58dea9fb06 100644 --- a/paddle/fluid/imperative/hccl_context.h +++ b/paddle/fluid/imperative/hccl_context.h @@ -50,6 +50,8 @@ class HCCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/heter_ccl_context.cc b/paddle/fluid/imperative/heter_ccl_context.cc new file mode 100644 index 0000000000000..896f29fdd0c25 --- /dev/null +++ b/paddle/fluid/imperative/heter_ccl_context.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
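The SelectedRows branch of AllReduce in the all_reduce.cc hunk above first allgathers every rank's row count, then either allgathers rows and values directly (the fast path noted in the comment, where allgather replaces broadcast) or broadcasts each rank's rows and values into the proper offset of the merged result. The single-process toy below only sketches the merge that ends up on every rank; it is not the NCCL-based implementation.

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Row ids contributed by three "ranks" of a toy SelectedRows allreduce.
  std::vector<std::vector<int>> rank_rows = {{0, 2}, {5}, {1, 3, 7}};

  // Step 1: allgather the per-rank row counts so every rank can size the output.
  std::vector<std::size_t> rows_num;
  for (const auto& rows : rank_rows) rows_num.push_back(rows.size());

  // Step 2: place each rank's rows at its offset in the merged row list.
  std::vector<int> dst_rows;
  dst_rows.reserve(std::accumulate(rows_num.begin(), rows_num.end(), std::size_t{0}));
  for (const auto& rows : rank_rows) {
    dst_rows.insert(dst_rows.end(), rows.begin(), rows.end());
  }

  std::cout << "merged row count: " << dst_rows.size() << "\n";  // 6
  return 0;
}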
+ +#include "paddle/fluid/imperative/heter_ccl_context.h" + +// NCCL first +#ifdef PADDLE_WITH_NCCL +#include "paddle/fluid/imperative/all_reduce.h" +#endif + +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy, + const int &device_id) +#ifdef PADDLE_WITH_NCCL + : ParallelContext(strategy, platform::CUDAPlace(device_id)) +#elif PADDLE_WITH_XPU_BKCL + : ParallelContext(strategy, platform::XPUPlace(device_id)) +#elif PADDLE_WITH_ASCEND_CL + : ParallelContext(strategy, platform::NPUPlace(device_id)) +#else + : ParallelContext(strategy, platform::CPUPlace()) +#endif +{ + // construct node_strategy_ from global strategy by selecting the + // endpoints with same ip address. + std::string node_ip = strategy_.current_endpoint_.substr( + 0, strategy_.current_endpoint_.find(':')); + int node_nranks = 0; + int inter_rank = -1; + + std::vector all_eps = strategy_.trainer_endpoints_; + std::vector inter_endpoints; + std::set nodes_ips; + for (auto ep : all_eps) { + std::string ip = ep.substr(0, ep.find(':')); + // record ip of different nodes + if (nodes_ips.find(ip) == nodes_ips.end()) { + if (ep == strategy_.current_endpoint_) { + inter_rank = nodes_ips.size(); + } + inter_endpoints.push_back(ep); + nodes_ips.emplace(ip); + } + + if (ip == node_ip) { + if (ep == strategy_.current_endpoint_) { + node_strategy_.local_rank_ = node_nranks; + } + node_nranks++; + node_strategy_.trainer_endpoints_.push_back(ep); + } + } + + VLOG(0) << "init node size " << node_nranks << " rank " + << node_strategy_.local_rank_; + + PADDLE_ENFORCE_NE(node_nranks, 0, + platform::errors::InvalidArgument( + "The number of local nranks should not be zero.")); + node_strategy_.nranks_ = node_nranks; + node_strategy_.current_endpoint_ = strategy_.current_endpoint_; + + if (inter_rank >= 0 && inter_endpoints.size() > 1) { + inter_strategy_.nranks_ = inter_endpoints.size(); + inter_strategy_.local_rank_ = inter_rank; + inter_strategy_.current_endpoint_ = strategy_.current_endpoint_; + inter_strategy_.trainer_endpoints_ = inter_endpoints; +#ifdef PADDLE_WITH_GLOO + inter_parallel_ctx_ = std::make_shared( + inter_strategy_, platform::CPUPlace()); +#endif + } + + VLOG(0) << "init inter size " << inter_endpoints.size() << " rank " + << inter_rank; + +#ifdef PADDLE_WITH_NCCL + node_place_ = platform::CUDAPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +#ifdef PADDLE_WITH_XPU_BKCL + node_place_ = platform::XPUPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + node_place_ = platform::NPUPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +} + +void HeterParallelContext::Init() { + PADDLE_ENFORCE_NE( + node_parallel_ctx_, nullptr, + platform::errors::Unavailable( + "The heter parallel context has not been initialized.")); + + if (inter_parallel_ctx_ != nullptr) { + inter_parallel_ctx_->Init(); + } + + node_parallel_ctx_->Init(); + + VLOG(3) << "/// 
DEBUG /// heter parallel env init done..." << std::endl; +} + +void HeterParallelContext::InitWithRingID(int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented InitWithRingID from heter ctx.")); +} + +void HeterParallelContext::AllReduceByStream(const framework::Variable &src, + framework::Variable *dst, + int ring_id, + bool use_calc_stream) { + // step 1: call reduce within node + VLOG(3) << "/// DEBUG /// step 1: reduce in node... "; + node_parallel_ctx_->AllReduceByStream(src, dst, ring_id, false); + node_parallel_ctx_->WaitComm(ring_id); + + // step 2: call allreduce between nodes with gloo + if (inter_parallel_ctx_ != nullptr) { + // copy src to cpu + // dst is now the src + auto src_tensor = dst->Get(); + framework::Variable src_cpu; + auto src_cpu_tensor = src_cpu.GetMutable(); + framework::TensorCopySync(src_tensor, platform::CPUPlace(), src_cpu_tensor); + + // allreduce src/cpu to dst/cpu + framework::Variable dst_cpu; + inter_parallel_ctx_->AllReduceByStream(src_cpu, &dst_cpu, ring_id, false); + inter_parallel_ctx_->WaitComm(ring_id); + + // copy dst/cpu to dst + auto dst_cpu_tensor = dst_cpu.Get(); + auto dst_tensor = dst->GetMutable(); + framework::TensorCopySync(dst_cpu_tensor, dst_tensor->place(), dst_tensor); + + inter_parallel_ctx_->WaitComm(ring_id); + } + + // step 3: call broadcast within node + VLOG(3) << "/// DEBUG /// step 3: broadcast within node... "; + node_parallel_ctx_->WaitComm(ring_id); + node_parallel_ctx_->Broadcast(dst, ring_id); + node_parallel_ctx_->WaitComm(ring_id); +} + +void HeterParallelContext::Broadcast(framework::Variable *src, int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented("Unimplemented function.")); +} + +paddle::platform::DeviceContext *HeterParallelContext::GetDeviceContext( + int ring_id) { + // directly call the implementation of target parallel ctx. + return node_parallel_ctx_->GetDeviceContext(ring_id); +} + +void HeterParallelContext::WaitCompute(int ring_id) { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->WaitCompute(ring_id); +} + +void HeterParallelContext::WaitComm(int ring_id) { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->WaitComm(ring_id); +} + +void HeterParallelContext::SynchronizeCompute() { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->SynchronizeCompute(); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/heter_ccl_context.h b/paddle/fluid/imperative/heter_ccl_context.h new file mode 100644 index 0000000000000..8ea5e85603ab5 --- /dev/null +++ b/paddle/fluid/imperative/heter_ccl_context.h @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
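HeterParallelContext::AllReduceByStream above builds the result in three stages: an allreduce inside each node, a CPU copy that is allreduced across nodes through the Gloo-based inter context (held only by the ranks that lead a node), and a broadcast of the combined tensor back to every rank in the node. The toy program below runs the same three stages on plain integers to show why every rank ends up with the global sum; it is an illustration, not the Paddle context classes.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // values[node][rank_in_node] for 2 toy nodes with 2 ranks each
  std::vector<std::vector<int>> values = {{1, 2}, {3, 4}};

  // Step 1: allreduce (sum) within each node.
  std::vector<int> node_sum(values.size(), 0);
  for (std::size_t n = 0; n < values.size(); ++n) {
    for (int v : values[n]) node_sum[n] += v;
  }

  // Step 2: allreduce across nodes (the Gloo/CPU stage between node leaders).
  int global_sum = 0;
  for (int s : node_sum) global_sum += s;

  // Step 3: broadcast the combined result back to every rank in each node.
  for (auto& node : values) {
    for (int& v : node) v = global_sum;
  }

  std::cout << "every rank now holds " << values[0][0] << "\n";  // 10
  return 0;
}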
+#pragma once + +#include +#include +#include + +#ifdef PADDLE_WITH_NCCL +#include "paddle/fluid/imperative/nccl_context.h" +#endif + +#ifdef PADDLE_WITH_XPU_BKCL +#include "paddle/fluid/imperative/bkcl_context.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/imperative/hccl_context.h" +#endif + +#include "paddle/fluid/imperative/gloo_context.h" +#include "paddle/fluid/imperative/parallel_context.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +class HeterParallelContext : public ParallelContext { + public: + explicit HeterParallelContext(const ParallelStrategy& strategy, + const int& device_id); + + ~HeterParallelContext() override = default; + + void Init() override; + + void InitWithRingID(int ring_id) override; + + void AllReduceByStream(const framework::Variable& src, + framework::Variable* dst, int ring_id, + bool use_calc_stream) override; + + void Broadcast(framework::Variable* src, int ring_id) override; + + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; + + void WaitCompute(int ring_id) override; + + void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; + + private: + ParallelStrategy inter_strategy_; + ParallelStrategy node_strategy_; + platform::Place node_place_; + std::shared_ptr node_parallel_ctx_{nullptr}; + std::shared_ptr inter_parallel_ctx_{nullptr}; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ec5fb63f0d933..892c864027d11 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -281,16 +281,6 @@ class VarBase { static ThreadSafeNameSet name_set_; }; -class Layer { - public: - virtual ~Layer() {} - - virtual std::vector> Forward( - const std::vector>& inputs) { - return {}; - } -}; - std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameVarBaseMap& ins, const NameVarBaseMap& outs, const framework::AttributeMap& attrs, diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 32becda4edc95..15146f6c1204e 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -20,6 +20,14 @@ #include "paddle/fluid/platform/gen_comm_id_helper.h" #endif +#ifdef PADDLE_WITH_NCCL +#include +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" @@ -127,6 +135,20 @@ void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, AllReduce(src, dst, strategy_, ring_id, use_calc_stream); } +void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::NCCLComm *comm = + platform::NCCLCommContext::Instance().Get(ring_id, place); + gpuStream_t stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); +} + paddle::platform::DeviceContext 
*NCCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( @@ -153,11 +175,11 @@ void NCCLParallelContext::WaitCompute(int ring_id) { // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif } @@ -179,11 +201,11 @@ void NCCLParallelContext::WaitComm(int ring_id) { // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif } diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 1eee393aa714b..bb5b8ea32df4f 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -18,7 +18,7 @@ #include #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/cuda_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif #ifdef PADDLE_WITH_NCCL @@ -60,6 +60,8 @@ class NCCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index f537a316014d6..8bdfccc144243 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -56,6 +56,8 @@ class ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; + virtual void Broadcast(framework::Variable* src, int ring_id) = 0; + virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0; // comm_stream[ring_id] wait compute_stream. diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 604f9d2be9e48..8875ef74bce14 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -24,6 +24,8 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + DECLARE_bool(check_nan_inf); DECLARE_bool(run_pten_kernel); DECLARE_bool(benchmark); @@ -299,44 +301,28 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 
0 : kernel_ctx->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (kernel_ctx->InputsSize() == start_idx) { - paddle::SmallVector> tmp_inputs; - for (const auto& var : ins_vector) { - const auto& variable = var->Var(); - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, in_def)); - } - kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs)); - } else if (kernel_ctx->InputsSize() > start_idx) { - size_t input_size = kernel_ctx->InputsSize(); - for (size_t j = 0; j < ins_vector.size(); ++j) { - if (input_size > start_idx + j) { + auto current_vector_size = kernel_ctx->InputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + const auto& variable = ins_vector[offset]->Var(); + if (current_vector_size > start_idx + offset) { + auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset); + if (input_ptr == nullptr) { + input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def); + } else { experimental::ReMakePtenDenseTensorFromVar( - ins_vector[j]->Var(), in_def, - kernel_ctx->MutableInputAt(start_idx + j)); - // TODO(chentianyu03): When multi input kernel, open this code - /* - } else { - kernel_ctx->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(), - in_def)); - */ + variable, in_def, kernel_ctx->MutableInputAt( + start_idx + offset)); } + } else { + kernel_ctx->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(variable, in_def)); } - kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.inputs.size() is " - "`%d`.", - start_idx, kernel_ctx->InputsSize())); } + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -345,44 +331,22 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. 
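The rewritten loops in this hunk replace the old three-way branch with a single reuse-or-create rule: kernel-context slots that already exist are refilled in place, and only offsets past the current size get fresh storage. A minimal sketch of that rule, using a hypothetical `Slot` type and `FillSlots` helper rather than Paddle's tensor classes:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

struct Slot { int payload = 0; };  // stand-in for a pten dense tensor

// Refill slots that already exist in the context; append new storage only
// for offsets beyond the current size (the reuse-or-create rule above).
void FillSlots(std::vector<std::unique_ptr<Slot>>* ctx_slots,
               const std::vector<int>& payloads, std::size_t start_idx) {
  const std::size_t current_size = ctx_slots->size();
  for (std::size_t offset = 0; offset < payloads.size(); ++offset) {
    if (current_size > start_idx + offset) {
      (*ctx_slots)[start_idx + offset]->payload = payloads[offset];  // reuse
    } else {
      auto slot = std::make_unique<Slot>();
      slot->payload = payloads[offset];
      ctx_slots->push_back(std::move(slot));                         // create
    }
  }
}
```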
- if (kernel_ctx->OutputsSize() == start_idx) { - paddle::SmallVector> tmp_outputs; - for (auto& var : outs_vector) { - auto* variable = var->MutableVar(); - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, out_def)); - } - kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs)); - } else if (kernel_ctx->OutputsSize() > start_idx) { - size_t output_size = kernel_ctx->OutputsSize(); - for (size_t j = 0; j < outs_vector.size(); ++j) { - if (output_size > i + j) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[j]->MutableVar(), out_def, - kernel_ctx->MutableOutputAt(i + j)); - // TODO(chentianyu03): When multi output kernel, open this code - /* - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar( - outs_vector[j]->MutableVar(), out_def)); - */ - } + auto current_vector_size = kernel_ctx->OutputsSize(); + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset]->MutableVar(), out_def, + kernel_ctx->MutableOutputAt(start_idx + offset)); + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar( + outs_vector[offset]->MutableVar(), out_def)); } - kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.outputs.size() is " - "`%d`.", - start_idx, kernel_ctx->OutputsSize())); } + kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { @@ -561,12 +525,8 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; -#endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif } diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index 1baf73ab3b95d..159371970dcac 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -101,6 +101,28 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, "`%s` type argument can not be cast into `Tensor`.", ptr->ptr()->ob_type->tp_name)); } + } else if (py::isinstance(*ptr) || + py::isinstance(*ptr)) { + try { + auto tuple_arg = ptr->cast(); + for (auto iter = tuple_arg.begin(); iter != tuple_arg.end(); ++iter) { + try { + auto t = iter->cast>(); + input_vars.push_back(t); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, " + "the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); + } + } + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, 
the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); + } } } } @@ -119,6 +141,28 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, "`%s` type argument can not be cast into `Tensor`.", ptr->second.ptr()->ob_type->tp_name)); } + } else if (py::isinstance(*ptr->second) || + py::isinstance(*ptr->second)) { + try { + auto tuple_arg = ptr->second.cast(); + for (auto iter = tuple_arg.begin(); iter != tuple_arg.end(); ++iter) { + try { + auto t = iter->cast>(); + input_vars.push_back(t); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, " + "the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); + } + } + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); + } } } } @@ -182,6 +226,15 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, } } if (if_inplace) { + // when pylayer forward is inplace strategy, check whether tensor is leaf + for (auto& t : input_vars) { + PADDLE_ENFORCE_EQ(t->IsLeaf() && !t->OverridedStopGradient(), false, + platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient can't " + "use inplace strategy.", + t->Name())); + } + inplace_map["X"] = "Out"; } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 2f023f644fd06..068de4f0435bb 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -27,8 +27,9 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_ASCEND_CL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -41,6 +42,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif + } else if (platform::is_npu_place(tensor->place())) { + // TODO(kuizhiqing) + VLOG(4) << "divnrank for npu not support yet"; } else if (platform::is_cpu_place(tensor->place())) { VLOG(4) << "before div 2" << *tensor; VLOG(4) << "NDiv for cpu devices : rank = " << nranks; @@ -207,6 +211,70 @@ void SplitTensorsWithType( } #endif +// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now. +// If later the operators::StridedMemcpyWithAxis0 is supported, +// then this specific SplitTensorsForAllReduce can be removed. 
+#ifdef PADDLE_WITH_ASCEND_CL +template <> +void SplitTensorsForAllReduce( + const platform::NPUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = p_dense_contents->GetMutable(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + operators::math::SplitFunctor + split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +template <> +void ConcatTensorsWithType( + const platform::NPUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +template <> +void SplitTensorsWithType( + const platform::NPUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -831,7 +899,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1014,7 +1082,7 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_GLOO) + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) ProcessUnusedDenseVars(); #endif // Initialize local used vars diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b5a7dd149f09f..3c03babc52cbe 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -48,8 +48,9 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_ASCEND_CL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index adb560df77c78..32e982f1f15ca 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -1,8 +1,10 @@ if(WIN32) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) else() - if (WITH_NCCL OR WITH_RCCL) + if 
(WITH_GLOO AND (WITH_NCCL OR WITH_RCCL)) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) + cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell) + #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST") endif() if (WITH_XPU_BKCL) cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc new file mode 100644 index 0000000000000..d36743510e5ba --- /dev/null +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include // NOLINT + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/heter_ccl_context.h" + +#include "gtest/gtest.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:37580", "127.0.0.1:37581"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = eps.size(); + strategy.local_rank_ = local_rank; + return strategy; +} + +#ifdef PADDLE_WITH_NCCL +void AllReduceByStream(int local_rank, int device_id) { + int data_size = 32; + const auto& place = platform::CUDAPlace(device_id); + platform::CUDADeviceContext ctx(place); + + // heter_parallel_ctx + imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); + + // init + hpc.Init(); + + // input and output data + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + std::vector src_vec; + for (int i = 0; i < data_size; i++) { + src_vec.push_back(1.0 + local_rank); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + ctx.Wait(); + + framework::Variable* dst_dev_var(new framework::Variable()); + auto* dst_dev_tensor = dst_dev_var->GetMutable(); + dst_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + // call allreduce + hpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + EXPECT_EQ(dst_vec.size(), src_vec.size()); + for (int i = 0; i < data_size; i++) { + EXPECT_EQ(dst_vec[i], 3.0); + } +} + +TEST(AllReduceByStream, Run) { + if (platform::GetGPUDeviceCount() >= 2) { + std::thread t0(AllReduceByStream, 0, 0); + std::thread t1(AllReduceByStream, 1, 1); + t0.join(); + t1.join(); + } +} 
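The expected value asserted in the test above comes straight from the fill pattern: rank r writes `1.0 + r` into every element, so a two-rank sum-allreduce yields `1.0 + 2.0 = 3.0` in every slot. A tiny standalone sketch of that arithmetic (hypothetical helper, not part of the test):

```cpp
#include <cassert>

// Each rank contributes (1.0 + rank), so the reduced value is
// nranks + nranks * (nranks - 1) / 2 in every element.
float ExpectedAllReduceValue(int nranks) {
  float expected = 0.0f;
  for (int rank = 0; rank < nranks; ++rank) {
    expected += 1.0f + static_cast<float>(rank);
  }
  return expected;
}

int main() {
  assert(ExpectedAllReduceValue(2) == 3.0f);  // matches EXPECT_EQ(dst_vec[i], 3.0)
  return 0;
}
```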
+#endif diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 2d8a08217b0b8..401e4e324eb89 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -14,6 +14,8 @@ #include // NOLINT +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -21,6 +23,7 @@ namespace imperative = paddle::imperative; namespace platform = paddle::platform; +namespace framework = paddle::framework; int nrings = 2; imperative::ParallelStrategy GetStrategy(int local_rank) { @@ -68,4 +71,51 @@ TEST(BcastNCCLId, Run) { NCCL_UNIQUE_ID_BYTES)); } } + +void Broadcast(int local_rank, int device_id) { + int data_size = 4; + float test_data = 7; + const auto& place = platform::CUDAPlace(device_id); + platform::CUDADeviceContext ctx(place); + + imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); + + // init + npc.Init(); + + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + // fill data for rank 0 only + std::vector src_vec; + if (local_rank == 0) { + for (int i = 0; i < data_size; i++) { + src_vec.push_back(test_data); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + } + ctx.Wait(); + + npc.Broadcast(src_dev_var, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + for (int i = 0; i < data_size; i++) { + EXPECT_EQ(dst_vec[i], test_data); + } +} + +TEST(Broadcast, Run) { + if (platform::GetGPUDeviceCount() >= 2) { + std::thread t0(Broadcast, 0, 0); + std::thread t1(Broadcast, 1, 1); + t0.join(); + t1.join(); + } +} #endif diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 9fbbe7d06f8ad..c257191a546e4 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -209,13 +209,23 @@ class VariableWrapper { uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } - void ResetInplaceVersion() { - auto new_version = var_.CurrentInplaceVersion(); + void ResetInplaceVersion(bool set_to_zero = false) { + if (!set_to_zero) { + auto new_version = var_.CurrentInplaceVersion(); - VLOG(6) << "The wrapper version of VariableWrapper '" << name_ - << "' will be updated from " << inplace_version_snapshot_ << "to " - << new_version; - inplace_version_snapshot_ = new_version; + VLOG(6) << "The wrapper version of VariableWrapper '" << name_ + << "' will be updated from " << inplace_version_snapshot_ << "to " + << new_version; + inplace_version_snapshot_ = new_version; + + } else { + // Reset Snapshot & InplaceVersion to zero + inplace_version_snapshot_ = 0; + auto var = this->MutableVar(); + if (var) { + var->SetInplaceVersionToZero(); + } + } } bool hasCacheKey(const paddle::framework::OpKernelType& key) { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 2202b94bee727..3fa417c2ea631 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -52,11 +52,11 @@ 
typedef struct { // The traversal order also affect the lifecycles, so different sort_kind is // used. void MemoryOptimizePass::CollectLifeCycle( - std::unordered_map* lifecycles, + Graph* graph, std::unordered_map* lifecycles, int sort_kind) const { - max_lifecycle_ = 0; + int max_lifecycle = 0; for (auto* op_node : framework::ir::TopologyVarientSort( - *graph_, static_cast(sort_kind))) { + *graph, static_cast(sort_kind))) { if (!op_node->IsOp()) continue; auto reads = op_node->inputs; auto writes = op_node->outputs; @@ -77,20 +77,20 @@ void MemoryOptimizePass::CollectLifeCycle( if (node->Var()->Persistable()) continue; std::string var = node->Name(); if (!lifecycles->count(var)) { - (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_); + (*lifecycles)[var] = std::make_pair(max_lifecycle, max_lifecycle); } else { (*lifecycles)[var].second = - std::max(max_lifecycle_, lifecycles->at(var).second); // max() + std::max(max_lifecycle, lifecycles->at(var).second); // max() } } } - ++max_lifecycle_; + ++max_lifecycle; } } void MemoryOptimizePass::CollectVarMemorySize( - space_table_t* space_table) const { + Graph* graph, space_table_t* space_table) const { const int fake_batch_size = 1; auto valid_var = [&](framework::ir::Node* node) -> bool { @@ -130,7 +130,7 @@ void MemoryOptimizePass::CollectVarMemorySize( // although it's not always the case. so black list is the best compromise // between performance and underlying principle. std::unordered_set black_list; - for (auto* node : graph_->Nodes()) { + for (auto* node : graph->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { @@ -141,7 +141,7 @@ void MemoryOptimizePass::CollectVarMemorySize( } // Collect tensors from graph. - for (auto* node : graph_->Nodes()) { + for (auto* node : graph->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR && @@ -304,7 +304,10 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { // 3. Perform reuse plan: Replace all var's name in the model according to the // mapping table. if (!argument->enable_memory_optim()) return; - graph_ = argument->main_graph_ptr(); + // Because of pass is a singleton, graph can not be member + // variables,otherwise,errors will be caused under multithreading + // conditions. 
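The comment below is the core of this change: the pass object is a singleton, so per-run state kept in mutable members (the removed `graph_` and `max_lifecycle_`) can race when several predictors run the pass concurrently. A minimal sketch of the pattern the fix adopts, with hypothetical `Graph` and pass names standing in for the real types:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

struct Graph {};  // stand-in for framework::ir::Graph

class MemoryPassSketch {
 public:
  static MemoryPassSketch& Instance() {
    static MemoryPassSketch pass;  // one instance shared by every thread
    return pass;
  }

  // All per-run state (the graph and the lifecycle counter) lives in
  // arguments and locals, so concurrent Run() calls cannot race.
  void Run(Graph* graph) const {
    int max_lifecycle = 0;
    std::unordered_map<std::string, std::pair<int, int>> lifecycles;
    CollectLifeCycle(graph, &lifecycles, &max_lifecycle);
  }

 private:
  void CollectLifeCycle(
      Graph* /*graph*/,
      std::unordered_map<std::string, std::pair<int, int>>* lifecycles,
      int* max_lifecycle) const {
    (*lifecycles)["var"] = {*max_lifecycle, *max_lifecycle};
    ++(*max_lifecycle);
  }
};

int main() {
  Graph g;
  MemoryPassSketch::Instance().Run(&g);  // safe to call from multiple threads
  return 0;
}
```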
+ auto graph = argument->main_graph_ptr(); int sort_kind = 0; std::unordered_map lifecycles; @@ -312,10 +315,10 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { std::unordered_map node2cluster; std::unordered_map cluster_size; - CollectLifeCycle(&lifecycles, sort_kind); - CollectVarMemorySize(&space_table); + CollectLifeCycle(graph, &lifecycles, sort_kind); + CollectVarMemorySize(graph, &space_table); MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); - UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); + UpdateOpDescsByReuse(graph, node2cluster, sort_kind); return; } diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 6d20aee295b7c..57052243d2f18 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -57,17 +57,15 @@ class MemoryOptimizePass : public AnalysisPass { private: void CollectLifeCycle( + framework::ir::Graph *graph, std::unordered_map *lifecycles, int sort_kind) const; - void CollectVarMemorySize(space_table_t *space_table) const; + void CollectVarMemorySize(framework::ir::Graph *graph, + space_table_t *space_table) const; public: std::string repr() const override; - - private: - mutable framework::ir::Graph *graph_{nullptr}; - mutable int max_lifecycle_{-1}; }; } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index ceca7e8146a79..49c4b8d7372e2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -19,8 +19,8 @@ #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/table_printer.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #ifdef PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/helper.h" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b1408995fa157..2293b70246853 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -41,8 +41,8 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/api/ext/op_meta_info.h" diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index c0038f6c3f038..d5452f82d08b5 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -19,9 +19,8 @@ PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset -TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include -TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib -MSVC_STATIC_CRT=$7 +TENSORRT_ROOT_DIR=$5 # TensorRT root dir, default to /usr +MSVC_STATIC_CRT=$6 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` 
| grep "Win") # detect current platform @@ -39,7 +38,7 @@ else fi USE_TENSORRT=OFF -if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then +if [ -d "$TENSORRT_ROOT_DIR" ]; then USE_TENSORRT=ON fi @@ -132,6 +131,28 @@ for WITH_STATIC_LIB in ON OFF; do fi done done + + # --------tensorrt mobilenet on windows------ + if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + rm -rf * + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=trt_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + Release/trt_mobilenet_demo.exe \ + --modeldir=$DATA_DIR/mobilenet/model \ + --data=$DATA_DIR/mobilenet/data.txt \ + --refer=$DATA_DIR/mobilenet/result.txt + if [ $? -ne 0 ]; then + echo "trt demo trt_mobilenet_demo runs fail." + exit 1 + fi + fi else # -----simple_on_word2vec on linux/mac----- rm -rf * @@ -183,8 +204,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ - -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 654b58a2ded34..aa29b779e471b 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -134,6 +134,16 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( scales_[var_name] = scales_[input_var_name]; } compute_scale = false; + } else if (op->Type() == "slice") { + auto input_var_name = op->Input("Input")[0]; + PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); + if (scales_.find(input_var_name) != scales_.end()) { + scales_[var_name] = scales_[input_var_name]; + } + compute_scale = false; } else if (op->Type() == "concat") { // output of ops with unsigned input must be unsigned is_unsigned = true; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 5a07cc7e240d5..6642a2c030b26 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -42,6 +42,9 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["transpose2"]["X"] = ScaleAlgo::KL; rules_["transpose2"]["Out"] = ScaleAlgo::NONE; + rules_["slice"]["Input"] = ScaleAlgo::KL; + rules_["slice"]["Out"] = ScaleAlgo::NONE; + rules_["fc"]["Input"] = ScaleAlgo::KL; rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T; rules_["fc"]["Bias"] = ScaleAlgo::NONE; diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index 57b5167337e25..d27f20a93b3a4 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -27,7 +27,7 @@ using paddle::PaddleDType; void* TensorUtils::CudaMallocPinnedMemory(size_t size) { #if defined(PADDLE_WITH_CUDA) void* ptr = nullptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size)); 
+ PADDLE_ENFORCE_GPU_SUCCESS(cudaMallocHost(&ptr, size)); return ptr; #else return nullptr; @@ -36,7 +36,7 @@ void* TensorUtils::CudaMallocPinnedMemory(size_t size) { void TensorUtils::CudaFreePinnedMemory(void* ptr) { #if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(ptr)); #endif } diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index d9cf9e2e86001..b468518fa5a3c 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -45,7 +45,7 @@ class DefaultIOConverter : public EngineIOConverter { "the input max_size. But in's memory_size = %u, max_size = %u.", size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 35c9658108ab5..26d87e4832f5f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -162,20 +162,6 @@ class Pool2dOpConverter : public OpConverter { } layer = pool_layer; } else if (!adaptive && !global_pooling && ceil_mode) { - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); - // If ceil mode is true, we will pad the appropriate size to the input. - DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, - input_dims); - auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *const_cast(input1), pre_pad, - post_pad); - PADDLE_ENFORCE_NOT_NULL( - pad_layer, platform::errors::Fatal( - "Pad layer in poolOp converter could not be " - "created. The pointer to pad layer is `NULL`.")); - input1 = pad_layer->getOutput(0); - auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); @@ -183,6 +169,8 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setAverageCountExcludesPadding(exclusive); if (padding_algorithm == "SAME") { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); } layer = pool_layer; } else if (global_pooling) { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 64116b7973e71..2addff52829c8 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu index 0f32183c0fbc1..70e5a7bcc7b4f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu @@ -43,16 +43,16 @@ nvinfer1::Weights DeformableConvPlugin::copyToDevice(const void* hostData, size_t count) { int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2); void* deviceData; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&deviceData, count * num_bytes)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy( - deviceData, hostData, count * num_bytes, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&deviceData, count * num_bytes)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(deviceData, hostData, count * num_bytes, + cudaMemcpyHostToDevice)); return nvinfer1::Weights{data_type_, deviceData, int64_t(count)}; } void DeformableConvPlugin::serializeFromDevice( void** hostBuffer, const nvinfer1::Weights& deviceWeights) const { int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(static_cast(*hostBuffer), deviceWeights.values, deviceWeights.count * num_bytes, cudaMemcpyDeviceToHost)); hostBuffer += deviceWeights.count * num_bytes; diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index a9a50543e7bb7..a4880a9997a53 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu index 88e075386d093..7cab12b625d23 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu @@ -33,31 +33,31 @@ void Ltgemm_int8_linear( cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* alpha_one, void* workspace, cudaStream_t stream) { if (transA_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescT, alpha_one, A, Adesc, alpha_zero, nullptr, nullptr, Atransform, AtransformDesc, stream)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, A, Adesc, alpha_zero, nullptr, nullptr, Atransform, AtransformDesc, stream)); } if (transB_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, B, Bdesc, alpha_zero, nullptr, nullptr, Btransform, BtransformDesc, stream)); } else { - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescT, alpha_one, B, Bdesc, alpha_zero, nullptr, nullptr, Btransform, BtransformDesc, stream)); } - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, Atransform, AtransformDesc, Btransform, BtransformDesc, nullptr, Ctransform, CtransformDesc, Ctransform, CtransformDesc, nullptr, workspace, 0, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, Ctransform, CtransformDesc, alpha_zero, nullptr, nullptr, C, Cdesc, stream)); } @@ -69,7 +69,7 @@ void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, const float* A, cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); } @@ -81,7 +81,7 @@ void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, const half* A, cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); } @@ -182,98 +182,98 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, int const ldatransform = 32 * n_; int const ldbtransform = 32 * ((m_ + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Atransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Btransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Ctransform_, sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, sizeof(COL4_4R2_8C))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); cublasOperation_t Transpose = CUBLAS_OP_T; cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE; - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescT_, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, sizeof(Transpose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescN_, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); @@ -282,20 +282,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, sizeof(ATranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, sizeof(BTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -303,17 +303,16 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, for (int i = 0; i < n_; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, n_ * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), 
cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { @@ -324,70 +323,69 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); half alpha_tem = static_cast(alpha_); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast(zero); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { @@ -398,71 +396,70 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); float alpha_tem = alpha_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -613,13 +610,13 @@ void MatmulPluginDynamic::configurePlugin( int const ldatransform = 32 * n_max; int const ldbtransform = 32 * ((m_max + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_max; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Atransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Btransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldbtransform)); - 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Ctransform_, sizeof(int8_t) * ((m_max + 32 - 1) / 32 * 32) / 32 * ldctransform)); @@ -628,38 +625,35 @@ void MatmulPluginDynamic::configurePlugin( for (int i = 0; i < n_max; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, n_max * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { half alpha_tem = static_cast(alpha_); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast(zero); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { float alpha_tem = alpha_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -766,88 +760,88 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32; cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C; - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? 
k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc, cudadataTypeIO, n, k, ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc, cudadataTypeIO, m, k, ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, sizeof(COL4_4R2_8C))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc, cudadataTypeIO, n, m, ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); cublasOperation_t Transpose = CUBLAS_OP_T; cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS)); - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, sizeof(Transpose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); @@ -856,20 +850,20 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, sizeof(ATranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, sizeof(BTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -889,60 +883,60 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? 
n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -959,60 +953,60 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, 
CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 091680ff672d0..ec4fcca6d74d0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -136,7 +136,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, float const* input_ptr = reinterpret_cast(inputs[0]); float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), cudaMemcpyHostToDevice, stream)); @@ -263,7 +263,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*), cudaMemcpyHostToDevice, stream)); @@ -279,7 +279,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, half* const* h_odatas = reinterpret_cast(outputs); half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*), cudaMemcpyHostToDevice, stream)); diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index c3b4a6ff4af1c..74a6c3cdf3e4e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -107,8 +107,13 @@ bool StackPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc& in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && + return ( +// This is a workaround for the Ernie fixed-length model. +// Enabling float and half at the same time will cause TensorRT to hang.
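The plugin hunks above (and the matmul plugin hunks before them) mechanically replace PADDLE_ENFORCE_CUDA_SUCCESS with PADDLE_ENFORCE_GPU_SUCCESS, so a single check macro appears to cover CUDA, ROCm/HIP, and cuBLAS(Lt) status codes alike. The sketch below only illustrates that overload-based pattern; it is not Paddle's actual macro, and FakeCudaError, FakeHipError, IsGpuSuccess, and GPU_CHECK are made-up names chosen so the example compiles without a GPU toolkit installed.

```cpp
// Illustrative sketch only, not Paddle's real PADDLE_ENFORCE_GPU_SUCCESS.
// Toy status enums stand in for cudaError_t / hipError_t.
#include <iostream>
#include <stdexcept>
#include <string>

enum class FakeCudaError { Success = 0, OutOfMemory = 2 };
enum class FakeHipError { Success = 0, OutOfMemory = 1002 };

// One overload per backend lets a single macro accept either status type.
inline bool IsGpuSuccess(FakeCudaError e) { return e == FakeCudaError::Success; }
inline bool IsGpuSuccess(FakeHipError e) { return e == FakeHipError::Success; }

#define GPU_CHECK(expr)                                                   \
  do {                                                                    \
    auto _status = (expr);                                                \
    if (!IsGpuSuccess(_status)) {                                         \
      throw std::runtime_error(std::string("GPU call failed: ") + #expr); \
    }                                                                     \
  } while (0)

int main() {
  GPU_CHECK(FakeCudaError::Success);        // passes silently
  try {
    GPU_CHECK(FakeHipError::OutOfMemory);   // throws
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```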
+#if IS_TRT_VERSION_LT(8000) + in.type == nvinfer1::DataType::kFLOAT || +#endif + in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); } else { return (in.type == nvinfer1::DataType::kFLOAT) && diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 86666950bc36e..c330867607f8e 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -85,7 +85,7 @@ bool TRTInt8Calibrator::setBatch( engine_name_, it.first)); } const auto& d = dataptr->second; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice)); } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6fd3944a6c528..a28b0c172aff0 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -94,6 +94,17 @@ function(inference_analysis_api_test target install_dir filename) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt --refer_result=${install_dir}/result.txt) endfunction() +function(inference_analysis_api_int8_test target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model + --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt + --accuracy=0.8 + --batch_size=5 + --enable_int8=true) +endfunction() + function(inference_multiple_models_analysis_api_test target install_dir filename) inference_analysis_test(${target} SRCS ${filename} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -284,13 +295,14 @@ set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) -#Ernie +# Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) -#Ernie large +# Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f) download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73) @@ -426,7 +438,7 @@ if(WITH_MKLDNN) # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) -# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) +# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg19 int8 # TODO(grygielski) Enable after 
MKL-DNN 1.0 merge @@ -730,6 +742,7 @@ set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc new file mode 100644 index 0000000000000..b85726647b548 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/analyzer_ernie_tester.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +#ifdef PADDLE_WITH_MKLDNN +void SetInt8Config(AnalysisConfig *cfg, + std::vector data) { + cfg->SetModel(FLAGS_infer_model); + cfg->EnableMKLDNN(); + cfg->EnableMkldnnQuantizer(); + auto warmup_data = std::make_shared>(data); + cfg->mkldnn_quantizer_config()->SetWarmupData(warmup_data); + cfg->mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_batch_size); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare_int8(bool use_mkldnn = false) { + std::vector> inputs; + LoadInputData(&inputs); + + AnalysisConfig cfg; + SetInt8Config(&cfg, inputs[0]); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_ernie, compare_int8_mkldnn) { + compare_int8(true /* use_mkldnn */); +} +#endif + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc index 0c2a140023e29..d6ff3e422368b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc @@ -12,142 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
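The new analyzer_ernie_int8_tester.cc above exercises post-training INT8 quantization through MKL-DNN: it enables MKLDNN and the MKLDNN quantizer, then hands the quantizer one warm-up batch so calibration scales can be collected. Below is a hedged sketch of that configuration flow using the AnalysisConfig calls visible in the patch; the header path, model path, and sizes are placeholders, and SetInt8LikeConfig is an illustrative helper, not part of the patch.

```cpp
// Hedged sketch of the INT8 setup used by the new tester.
// The include path may differ per install; values below are placeholders.
#include <memory>
#include <string>
#include <vector>

#include "paddle/include/paddle_inference_api.h"

void SetInt8LikeConfig(paddle::AnalysisConfig* cfg,
                       const std::vector<paddle::PaddleTensor>& warmup_batch,
                       const std::string& model_dir, int batch_size,
                       int cpu_threads) {
  cfg->SetModel(model_dir);
  cfg->EnableMKLDNN();
  cfg->EnableMkldnnQuantizer();
  // The quantizer calibrates scales from one representative warm-up batch.
  auto warmup =
      std::make_shared<std::vector<paddle::PaddleTensor>>(warmup_batch);
  cfg->mkldnn_quantizer_config()->SetWarmupData(warmup);
  cfg->mkldnn_quantizer_config()->SetWarmupBatchSize(batch_size);
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
  cfg->SetCpuMathLibraryNumThreads(cpu_threads);
}
```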
-#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/tests/api/analyzer_ernie_tester.h" namespace paddle { namespace inference { using paddle::PaddleTensor; -template -void GetValueFromStream(std::stringstream *ss, T *t) { - (*ss) >> (*t); -} - -template <> -void GetValueFromStream(std::stringstream *ss, std::string *t) { - *t = ss->str(); -} - -// Split string to vector -template -void Split(const std::string &line, char sep, std::vector *v) { - std::stringstream ss; - T t; - for (auto c : line) { - if (c != sep) { - ss << c; - } else { - GetValueFromStream(&ss, &t); - v->push_back(std::move(t)); - ss.str({}); - ss.clear(); - } - } - - if (!ss.str().empty()) { - GetValueFromStream(&ss, &t); - v->push_back(std::move(t)); - ss.str({}); - ss.clear(); - } -} - -// Parse tensor from string -template -bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { - std::vector data; - Split(field, ':', &data); - if (data.size() < 2) return false; - - std::string shape_str = data[0]; - - std::vector shape; - Split(shape_str, ' ', &shape); - - std::string mat_str = data[1]; - - std::vector mat; - Split(mat_str, ' ', &mat); - - tensor->shape = shape; - auto size = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * - sizeof(T); - tensor->data.Resize(size); - std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); - tensor->dtype = GetPaddleDType(); - - return true; -} - -// Parse input tensors from string -bool ParseLine(const std::string &line, - std::vector *tensors) { - std::vector fields; - Split(line, ';', &fields); - - tensors->clear(); - tensors->reserve(4); - - int i = 0; - auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; - for (; i < 3; i++) { - paddle::PaddleTensor temp; - ParseTensor(fields[i], &temp); - temp.name = input_name + std::to_string(i); - tensors->push_back(temp); - } - - // input_mask - paddle::PaddleTensor input_mask; - ParseTensor(fields[i], &input_mask); - input_mask.name = input_name + std::to_string(i); - tensors->push_back(input_mask); - - return true; -} - -bool LoadInputData(std::vector> *inputs) { - if (FLAGS_infer_data.empty()) { - LOG(ERROR) << "please set input data path"; - return false; - } - - std::ifstream fin(FLAGS_infer_data); - std::string line; - int sample = 0; - - // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
- while (std::getline(fin, line)) { - std::vector feed_data; - ParseLine(line, &feed_data); - inputs->push_back(std::move(feed_data)); - sample++; - if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; - } - LOG(INFO) << "number of samples: " << sample; - return true; -} - -void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, - bool use_gpu = false) { - cfg->SetModel(FLAGS_infer_model); - if (use_mkldnn) { - cfg->EnableMKLDNN(); - } - if (use_gpu) { - cfg->EnableUseGpu(100, 0); - } else { - cfg->DisableGpu(); - } - cfg->SwitchSpecifyInputNames(); - cfg->SwitchIrOptim(); - cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); -} - void profile(bool use_mkldnn = false, bool use_gpu = false) { AnalysisConfig config; + SetConfig(&config, use_mkldnn, use_gpu); std::vector> outputs; @@ -189,11 +63,12 @@ TEST(Analyzer_Ernie, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { + std::vector> inputs; + LoadInputData(&inputs); + AnalysisConfig cfg; SetConfig(&cfg, use_mkldnn, false); - std::vector> inputs; - LoadInputData(&inputs); CompareNativeAndAnalysis( reinterpret_cast(&cfg), inputs); } diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h new file mode 100644 index 0000000000000..dd3faac759210 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h @@ -0,0 +1,152 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
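The helpers moved into analyzer_ernie_tester.h above (GetValueFromStream, Split, ParseTensor, ParseLine) parse one text line of the form "shape:values;..." into PaddleTensor feeds. The toy program below sketches only the split-and-convert idea with standard-library types; it is a simplification, not the templated implementation from the header.

```cpp
// Toy reimplementation of the Split/ParseTensor pattern from the tester header.
// Names and behavior are simplified for illustration.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

template <typename T>
std::vector<T> Split(const std::string& line, char sep) {
  std::vector<T> out;
  std::stringstream ss(line);
  std::string field;
  while (std::getline(ss, field, sep)) {
    if (field.empty()) continue;
    std::stringstream conv(field);
    T value;
    conv >> value;
    out.push_back(value);
  }
  return out;
}

int main() {
  // One "shape:data" field, like those consumed by ParseTensor above.
  const std::string tensor_field = "1 128:0.5 0.25 0.125";
  const auto colon = tensor_field.find(':');
  const auto shape = Split<int>(tensor_field.substr(0, colon), ' ');
  const auto data = Split<float>(tensor_field.substr(colon + 1), ' ');
  std::cout << "dims=" << shape.size() << " values=" << data.size() << "\n";
  return 0;
}
```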
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, + bool use_gpu = false) { + cfg->SetModel(FLAGS_infer_model); + if (use_mkldnn) { + cfg->EnableMKLDNN(); + } + if (use_gpu) { + cfg->EnableUseGpu(100, 0); + } else { + cfg->DisableGpu(); + } + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 69134e1c76bb7..97952e4b71641 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -19,14 +19,13 @@ if (WITH_GPU) DEPS device_context malloc) nv_test(stream_safe_cuda_alloc_test SRCS stream_safe_cuda_alloc_test.cu - DEPS malloc) + DEPS malloc cuda_graph_with_memory_pool) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES - ENVIRONMENT "FLAGS_use_system_allocator=false" - ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") - endif() + ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true; + FLAGS_allocator_strategy=auto_growth") + endif() endif() if (WITH_ROCM) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4d44c533b7456..b3351f44dc35a 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -48,6 +48,8 @@ if (WITH_GPU OR WITH_ROCM) endif() elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_IPU) + set(AllocatorFacadeDeps ipu_info) elseif(WITH_ASCEND) set(AllocatorFacadeDeps ascend_npu_info) else () diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index b7b238bd0bf53..2aed7ec001d2a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -30,13 +30,10 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #ifdef PADDLE_WITH_CUDA -#include -#include "paddle/fluid/platform/cuda_graph.h" -#else -#include +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #endif #if CUDA_VERSION >= 10020 @@ -54,6 +51,10 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -70,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, // NOTE(Ruibiao): This FLAGS is just to be compatibled with // the old single-stream CUDA allocator. It will be removed // after StreamSafeCudaAllocator has been fully tested. 
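The CMake hunk above wires FLAGS_use_stream_safe_cuda_allocator=true and FLAGS_allocator_strategy=auto_growth into the environment of stream_safe_cuda_alloc_test, while the flag definition just below flips the default to false. PADDLE_DEFINE_EXPORTED_bool appears to wrap an ordinary gflags boolean, so such a switch can also be toggled on the command line, and later hunks in this patch suggest exporting FLAGS_use_stream_safe_cuda_allocator=true in the terminal. The standalone gflags sketch below only illustrates that flag pattern and is not Paddle code.

```cpp
// Standalone gflags sketch; the assumption is that Paddle's
// PADDLE_DEFINE_EXPORTED_bool behaves like the plain DEFINE_bool below.
#include <iostream>

#include "gflags/gflags.h"

DEFINE_bool(use_stream_safe_cuda_allocator, false,
            "Enable the stream-safe CUDA allocator (default off in this sketch).");

int main(int argc, char* argv[]) {
  // e.g. ./flag_demo --use_stream_safe_cuda_allocator=true
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  std::cout << "stream-safe allocator enabled: " << std::boolalpha
            << FLAGS_use_stream_safe_cuda_allocator << "\n";
  return 0;
}
```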
-PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, +PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false, "Enable StreamSafeCUDAAllocator"); DECLARE_string(allocator_strategy); @@ -139,14 +140,18 @@ class AllocatorFacadePrivate { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator) { LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " "naive_best_fit strategy"; FLAGS_use_stream_safe_cuda_allocator = false; } - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); @@ -172,13 +177,13 @@ class AllocatorFacadePrivate { if (FLAGS_use_stream_safe_cuda_allocator) { // TODO(Ruibiao): Support multi-stream allocator for other strategies default_stream_ = nullptr; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); } } else { - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), allow_free_idle_chunk_); @@ -190,6 +195,11 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } #endif break; } @@ -201,6 +211,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator) { LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " @@ -208,8 +223,7 @@ class AllocatorFacadePrivate { FLAGS_use_stream_safe_cuda_allocator = false; } - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); @@ -399,10 +413,10 @@ class AllocatorFacadePrivate { CUdevice device; int val; try { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGetAttribute( &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device)); @@ -476,10 +490,10 @@ class AllocatorFacadePrivate { CUdevice device; int val; try { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( 
paddle::platform::dynload::cuDeviceGetAttribute( &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device)); @@ -575,6 +589,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_IPU + void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); @@ -596,10 +616,17 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::IPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int i = 0; i < device_count; ++i) { platform::CUDAPlace p(i); system_allocators_[p] = std::make_shared(p); @@ -612,7 +639,7 @@ class AllocatorFacadePrivate { std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); } @@ -630,6 +657,12 @@ class AllocatorFacadePrivate { places.emplace_back(platform::NPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::IPUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -704,10 +737,18 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } #endif + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } @@ -721,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, size)->Allocate(size); + } +#endif + return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size, m_->GetDefaultStream()); } #endif + return m_->GetAllocator(place, size)->Allocate(size); } @@ -732,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_ + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); + } +#endif + return 
Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } @@ -750,6 +806,14 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + return std::shared_ptr(Alloc(place, size, stream)); } @@ -762,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, "multi-stream 'Alloca' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true) ->Allocate(size); @@ -779,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + return m_->GetAllocator(place, stream)->Release(place); } @@ -791,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation, "'RecordStream' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 4cd8b4e91e614..0d9f1043d9e86 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -19,7 +19,7 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 9f34f5198a179..dd2a65d889d8d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -100,11 +100,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { - VLOG(10) << "Free " << allocation->size() << " bytes"; + VLOG(10) << "Free " << allocation->size() + << " bytes, ptr = " << allocation->ptr(); 
std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index 193ef5a0cb922..4469673b305bf 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -19,7 +19,7 @@ #include // NOLINT #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_double(fraction_of_gpu_memory_to_use); diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index b1a45afa99d9a..4242083f2e617 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -25,8 +25,8 @@ #include #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { @@ -37,8 +37,8 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. This may be a bug")); - platform::RecordedCudaFree(allocation->ptr(), allocation->size(), - place_.device); + platform::RecordedGpuFree(allocation->ptr(), allocation->size(), + place_.device); delete allocation; } @@ -46,13 +46,13 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; - auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device); + auto result = platform::RecordedGpuMalloc(&ptr, size, place_.device); if (LIKELY(result == gpuSuccess)) { return new Allocation(ptr, size, platform::Place(place_)); } size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); size_t allocated = total - avail; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 3d6f1d7bcbea6..9e04fd3f0619e 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -81,10 +81,10 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); #endif } @@ -93,9 +93,9 @@ class CUDADeviceContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } } @@ -111,12 +111,11 @@ class CUDADeviceContextAllocator : public Allocator { new CUDADeviceContextAllocation(memory::Alloc(place_, size)); // Wait for the event on stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); #endif return allocation; } diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index e3780f2f11359..f4baca8288f03 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -23,8 +23,8 @@ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" -#include "paddle/fluid/platform/gpu_info.h" #endif #if CUDA_VERSION >= 10020 @@ -49,10 +49,10 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // Prepare the access descriptor array indicating where and how the backings // should be visible. - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { if (place.device != dev_id) { int capable = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceCanAccessPeer(&capable, place.device, dev_id)); if (!capable) { VLOG(1) << "device(" << place.device @@ -73,10 +73,10 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // Get the minimum granularity needed for all devices // (the max of the minimum granularity of each participating device) granularity_ = 0; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { size_t granularity; prop.location.id = dev_id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuMemGetAllocationGranularity( &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); granularity_ = std::max(granularity, granularity_); @@ -84,7 +84,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( size_t actual_avail, actual_total; paddle::platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); virtual_mem_size_ = AlignedSize(actual_total, granularity_); @@ -93,7 +93,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // GPU, // so the virtual address space size we reserve is equal to the GPU video // memory size - PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( &virtual_mem_base_, virtual_mem_size_, 0, 0, 0)); virtual_mem_alloced_offset_ = 0; @@ -123,11 +123,11 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { auto result = 
paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second); if (result != CUDA_ERROR_DEINITIALIZED) { - PADDLE_ENFORCE_CUDA_SUCCESS(result); + PADDLE_ENFORCE_GPU_SUCCESS(result); } if (result != CUDA_ERROR_DEINITIALIZED) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease( + PADDLE_ENFORCE_GPU_SUCCESS(platform::RecordedGpuMemRelease( iter->second.first, iter->second.second, place_.device)); } @@ -166,12 +166,12 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { // Create physical memory backing allocation. auto result = - platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device); + platform::RecordedGpuMemCreate(&handle, size, &prop_, 0, place_.device); if (result != CUDA_SUCCESS) { if (result == CUDA_ERROR_OUT_OF_MEMORY) { size_t actual_avail, actual_total; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); size_t actual_allocated = actual_total - actual_avail; PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -186,7 +186,7 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { string::HumanReadableSize(actual_allocated), string::HumanReadableSize(actual_avail), place_.device)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(result); + PADDLE_ENFORCE_GPU_SUCCESS(result); } return nullptr; } @@ -197,8 +197,8 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0); if (result != CUDA_SUCCESS) { - platform::RecordedCuMemRelease(handle, size, place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(result); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); return nullptr; } @@ -208,8 +208,8 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { if (result != CUDA_SUCCESS) { paddle::platform::dynload::cuMemUnmap(ptr, size); - platform::RecordedCuMemRelease(handle, size, place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(result); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); return nullptr; } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 3bdd856759dc1..6de32335c62b2 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,8 +20,8 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5aa0514432844..c56a7235c109c 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,18 +20,18 @@ namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr())); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index b2e13af6ef956..86f3135ee4d14 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { for (StreamSafeCUDAAllocator* allocator : allocators) { release_size += allocator->ProcessEventsAndFreeWithRelease(); } + VLOG(8) << "Release " << release_size + << " bytes memory from all stream for place " << place; return release_size; } @@ -112,13 +114,13 @@ void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( for (gpuStream_t stream : *recorded_streams) { gpuEvent_t event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); #endif outstanding_events->emplace_back(event); VLOG(9) << "Record event " << event << " in stream " << stream; @@ -162,8 +164,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { outstanding_events.erase(outstanding_events.begin(), deque_it); break; } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(*deque_it)); #else gpuError_t err = hipEventQuery(*deque_it); if (err == hipErrorNotReady) { @@ -173,8 +175,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { outstanding_events.erase(outstanding_events.begin(), deque_it); break; } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(*deque_it)); #endif ++deque_it; } diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 654fb3fe7bc04..c55f579981b00 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -20,7 +20,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 88dbec2bcfd0c..b7be895b35830 100644 
--- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -25,8 +25,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 8b3d776cef210..cd152843553a9 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -24,8 +24,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 75b93088e5502..b300f936f7a68 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -27,9 +27,9 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" @@ -115,7 +115,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { if (size <= 0) return nullptr; void* p; - auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_); + auto result = platform::RecordedGpuMalloc(&p, size, gpu_id_); if (result == gpuSuccess) { *index = 0; @@ -123,7 +123,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { return p; } else { size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); size_t allocated = total - avail; @@ -166,7 +166,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { size, gpu_alloc_size_)); gpu_alloc_size_ -= size; - platform::RecordedCudaFree(p, size, gpu_id_); + platform::RecordedGpuFree(p, size, gpu_id_); } bool GPUAllocator::UseGpu() const { return true; } diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index ead188341dac4..bb7f47f9d30ec 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -19,6 +19,9 @@ limitations under the License. 
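The system_allocator hunks above switch the per-device accounting helpers from RecordedCudaMalloc/RecordedCudaMemGetInfo/RecordedCudaFree to their RecordedGpu* counterparts, so one set of entry points serves both CUDA and ROCm builds. A minimal sketch of that call pattern, assuming only the signatures visible in this patch (the free-standing helper names below are illustrative, not part of the patch):

#include "paddle/fluid/platform/device/gpu/gpu_info.h"

// Illustrative wrapper around the renamed bookkeeping helpers.
void* TrackedDeviceAlloc(size_t size, int gpu_id) {
  void* ptr = nullptr;
  // Allocates and records `size` bytes against `gpu_id` in one call.
  auto result = paddle::platform::RecordedGpuMalloc(&ptr, size, gpu_id);
  if (result == gpuSuccess) return ptr;
  // On failure, query how much memory the device (and the per-device limit,
  // if any) actually has left, mirroring GPUAllocator::Alloc above.
  size_t avail = 0, total = 0, actual_avail = 0, actual_total = 0;
  paddle::platform::RecordedGpuMemGetInfo(&avail, &total, &actual_avail,
                                          &actual_total, gpu_id);
  return nullptr;
}

void TrackedDeviceFree(void* ptr, size_t size, int gpu_id) {
  // Keeps the recorded usage for `gpu_id` in sync with the release.
  paddle::platform::RecordedGpuFree(ptr, size, gpu_id);
}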
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif DECLARE_bool(use_pinned_memory); @@ -77,11 +80,7 @@ TEST(GPUAllocator, AllocFailure) { allocator.Alloc(&index, alloc_size); ASSERT_TRUE(false); } catch (paddle::memory::allocation::BadAlloc&) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); -#endif + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::GpuGetLastError()); } } #endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 574b152054399..fe38200efa8e2 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -33,6 +33,32 @@ void Copy(platform::CPUPlace, void* dst, VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } +#ifdef PADDLE_WITH_IPU +template <> +void Copy(platform::IPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::IPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +template <> +void Copy(platform::IPUPlace dst_place, + void* dst, + platform::IPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +#endif #ifdef PADDLE_WITH_XPU template <> diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index c630437224cd0..7d2d2526ab124 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 76a880755e21b..837c964e2ad32 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -19,7 +19,7 @@ limitations under the License. 
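The AllocFailure test above also collapses a backend #ifdef into a single call: PADDLE_ENFORCE_GPU_SUCCESS accepts the error codes of both runtimes, and platform::GpuGetLastError() hides the cudaGetLastError/hipGetLastError split. The before/after shape of such a call site, taken directly from that hunk:

// Before: one branch per backend at every call site.
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
#endif

// After: a single macro plus a platform helper that dispatches internally.
  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::GpuGetLastError());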
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" // This unit test is an example comparing the performance between using pinned diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 6a5818fd9603b..134c368d4340e 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -25,8 +25,10 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { @@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) { } } +void CheckMemLeak(const platform::CUDAPlace &place) { + uint64_t cuda_malloc_size = + platform::RecordedGpuMallocSize(place.GetDeviceId()); + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << " there may be a memory leak problem"; +} + class StreamSafeCUDAAllocTest : public ::testing::Test { protected: void SetUp() override { @@ -53,9 +63,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t i = 1; i < stream_num_; ++i) { gpuStream_t stream; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); #endif streams_.emplace_back(stream); } @@ -65,10 +75,10 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { std::shared_ptr allocation = AllocShared(place_, allocation_size, streams_[i]); #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemset(allocation->ptr(), 0, allocation->size())); #endif allocations_.emplace_back(allocation); @@ -111,13 +121,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { // tricky code, the allocations are still accessible even though // allocations_.clear() has been called #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), hipMemcpyDeviceToHost)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), + hipMemcpyDeviceToHost)); #endif for (int j = 0; j < data_num_; ++j) { EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); @@ -127,9 +137,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { void TearDown() override { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif for (gpuStream_t stream : streams_) { Release(place_, stream); @@ -137,17 +147,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t 
i = 1; i < stream_num_; ++i) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); #endif } - uint64_t cuda_malloc_size = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; + CheckMemLeak(place_); } size_t stream_num_; @@ -186,17 +192,70 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { Alloc(place, alloc_size, default_stream); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); + allocation_unique.reset(); + + Release(place); + CheckMemLeak(place); } +TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + auto &instance = allocation::AllocatorFacade::Instance(); + const std::shared_ptr &allocator = instance.GetAllocator(place); + + size_t alloc_size = 256; + std::shared_ptr allocation_from_allocator = + allocator->Allocate(alloc_size); + EXPECT_GE(allocation_from_allocator->size(), alloc_size); + void *address = allocation_from_allocator->ptr(); + allocation_from_allocator.reset(); + + std::shared_ptr allocation_implicit_stream = + AllocShared(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + EXPECT_EQ(allocation_implicit_stream->ptr(), address); + allocation_implicit_stream.reset(); + + Release(place); + CheckMemLeak(place); +} + +#ifdef PADDLE_WITH_CUDA +TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 1; + std::shared_ptr allocation = AllocShared(place, alloc_size); + + platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); + EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); + EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); + EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); + EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), + paddle::platform::EnforceNotMet); + EXPECT_THROW(AllocShared(place, alloc_size, nullptr), + paddle::platform::EnforceNotMet); + EXPECT_THROW(Alloc(place, alloc_size, nullptr), + paddle::platform::EnforceNotMet); + EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); + EXPECT_THROW(RecordStream(allocation.get(), nullptr), + paddle::platform::EnforceNotMet); + platform::EndCUDAGraphCapture(); + + allocation.reset(); + Release(place); + CheckMemLeak(place); +} +#endif + TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream2)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream2)); #endif size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size @@ -216,13 +275,14 @@ 
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { allocation2.reset(); #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif Release(place, stream1); Release(place, stream2); + CheckMemLeak(place); } } // namespace memory diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a8f35d61f3c4c..f0621af9bbda5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -169,8 +169,10 @@ endif() if (WITH_CINN) op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS transform_desc cinn_compiler cinn ${OP_HEADER_DEPS}) - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) - set_tests_properties(cinn_launch_op_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + if (WITH_TESTING) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + set_tests_properties(cinn_launch_op_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + endif() endif() # FIXME(typhoonzero): operator deps may not needed. @@ -203,6 +205,7 @@ elseif(WITH_ROCM) else() cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3) endif() +cc_test(share_buffer_op_cpp_test SRCS share_buffer_op_test.cc DEPS lod_tensor device_context share_buffer_op) cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc index 38499783eb492..2ad92e36272b3 100644 --- a/paddle/fluid/operators/activation_cudnn.cu.cc +++ b/paddle/fluid/operators/activation_cudnn.cu.cc @@ -14,11 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index b197d3511f96b..2776fe9c13132 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -14,11 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace platform { @@ -64,13 +60,13 @@ struct CudnnActivationFunctor { x_desc.set(x); out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation")); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationForward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), out->mutable_data(ctx_.GetPlace()))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnActivationForward( ctx_.cudnn_handle(), 
act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), @@ -108,14 +104,14 @@ struct CudnnActivationGradFunctor { dout_desc.set(dout); dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad")); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), dx_desc.desc(), dx->mutable_data(ctx_.GetPlace()))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 0294bfd5b05d5..07cf516c476e8 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 5fa1e18553bd5..cf4041f721af2 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -23,7 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index b8ce52387b959..31801b14564d3 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel { const T* output_grad_data = output_grad->data(); T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSpatialTfGridGeneratorBackward( handle, cudnn_st_desc, output_grad_data, theta_grad_data)); } diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 3125e005174de..d1da11028c05c 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -18,12 +18,7 @@ limitations under the License. 
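Beyond the macro rename, these operator hunks mostly swap per-backend headers for the consolidated ones under platform/device/gpu/. The mapping below is read off the includes changed in the hunks above, collected for reference (other headers may move elsewhere in the series):

// cudnn_desc.h / miopen_desc.h, cudnn_helper.h / miopen_helper.h
//     -> paddle/fluid/platform/device/gpu/gpu_dnn.h
// cuda_device_function.h -> paddle/fluid/platform/device/gpu/gpu_device_function.h
// cuda_primitives.h      -> paddle/fluid/platform/device/gpu/gpu_primitives.h
// gpu_info.h             -> paddle/fluid/platform/device/gpu/gpu_info.h
//
// A kernel that previously carried an #ifdef PADDLE_WITH_HIP include branch
// can now include the single wrapper header:
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"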
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index 58b56bdcf5614..bcf7deefc98f0 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/affine_grid_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc new file mode 100644 index 0000000000000..3cb0148681496 --- /dev/null +++ b/paddle/fluid/operators/angle_op.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/angle_op.h" + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class AngleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "angle"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "angle"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class AngleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of angle op."); + AddOutput("Out", "(Tensor), The output tensor of angle op."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_cudnn", + "(bool, default false) Only used in cudnn kernel, need " + "install cudnn") + .SetDefault(false); + AddComment(R"DOC( +Angle Operator. + +This operator is used to perform elementwise angle for input $X$. 
+$$out = angle(x)$$ + +)DOC"); + } +}; + +class AngleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "angle_grad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Out@Grad", "angle_grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "angle_grad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class AngleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("angle_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(angle, ops::AngleOp, ops::AngleOpMaker, + ops::AngleGradMaker, + ops::AngleGradMaker); + +REGISTER_OP_CPU_KERNEL( + angle, ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); + +REGISTER_OPERATOR(angle_grad, ops::AngleGradOp); + +REGISTER_OP_CPU_KERNEL( + angle_grad, ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/angle_op.cu b/paddle/fluid/operators/angle_op.cu new file mode 100644 index 0000000000000..3264f426a77d1 --- /dev/null +++ b/paddle/fluid/operators/angle_op.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/angle_op.h" +#include "paddle/fluid/platform/complex.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + angle, ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); + +REGISTER_OP_CUDA_KERNEL( + angle_grad, ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h new file mode 100644 index 0000000000000..093a04f03df95 --- /dev/null +++ b/paddle/fluid/operators/angle_op.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include +#include "paddle/fluid/operators/math/complex_functors.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +namespace math { +template +struct AngleFunctor; + +// angel function for complex +template +struct AngleFunctor>> { + AngleFunctor(const T* input, Real* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = arg(input_[idx]); + } + + const T* input_; + Real* output_; + int64_t numel_; +}; + +// angel function for real +template +struct AngleFunctor>> { + AngleFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = input_[idx] < static_cast(0) ? M_PI : 0; + } + + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct AngleGradFunctor; + +// angle grad for complex +template +struct AngleGradFunctor>> { + AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + if (x_[idx] == T(0)) { + dx_[idx] = T(0); + } else { + const math::Real r_square = + x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; + dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, + dout_[idx] * x_[idx].real / r_square); + } + } + + const math::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +// angle grad for real +template +struct AngleGradFunctor>> { + AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } + + const math::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; +} // namespace math + +using Tensor = framework::Tensor; +template +class AngleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + auto numel = x->numel(); + auto* x_data = x->data(); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::AngleFunctor functor(x_data, out_data, numel); + for_range(functor); + } +}; + +template +class AngleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data>(); + auto* x_data = x->data(); + auto* dx_data = d_x->mutable_data( + ctx.GetPlace(), 
static_cast(numel * sizeof(T))); + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index f50d5e619ebea..6236a07de4bc6 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/argsort_op.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #ifdef __HIPCC__ namespace rocprim { @@ -169,7 +169,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); Tensor temp_storage; temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); @@ -188,7 +188,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); } template diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 2796a6b2239b9..3bffe0a05a8f7 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/average_accumulates_op.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index b686c766e0f8b..c326929a14680 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
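The new angle kernels above reduce to the two functors in angle_op.h: for complex inputs the output is the argument of each element, and for real inputs it is pi for negative values and 0 otherwise (with a zero gradient). A standalone sketch of those expected values using only the standard library, purely illustrative and independent of the Paddle kernels:

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES  // needed for M_PI on some toolchains, as in angle_op.h
#endif
#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  // Real input: matches the real-typed AngleFunctor above.
  const double xs[] = {-2.0, 0.0, 3.5};
  for (double x : xs) {
    double angle = (x < 0.0) ? M_PI : 0.0;
    std::printf("angle(%g) = %g\n", x, angle);
  }
  // Complex input: arg(z) == atan2(imag, real), which is what the complex
  // AngleFunctor computes element-wise.
  const std::complex<double> z(1.0, 1.0);
  std::printf("angle(1+1i) = %g (pi/4 = %g)\n", std::arg(z), M_PI / 4.0);
  return 0;
}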
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index b4cf9c48df2a8..e3dc54e17cd7f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -197,18 +197,18 @@ class BatchNormKernel // miopenTensorDescriptor_t bn_param_desc_; // miopenBatchNormMode_t mode_; -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); #endif @@ -251,23 +251,22 @@ class BatchNormKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, // x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), // const_cast(strides.data()))); // Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor( // bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, + test_mode ? 
CUDNN_BATCHNORM_SPATIAL : mode_)); #endif const auto *scale = ctx.Input("Scale"); @@ -341,7 +340,7 @@ class BatchNormKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationForwardInference( // handle, miopenBNSpatial, // const_cast( @@ -364,7 +363,7 @@ class BatchNormKernel // est_var->template data>())), // epsilon)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( handle, // Note: PERSISTENT not implemented for inference @@ -426,7 +425,7 @@ class BatchNormKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -440,7 +439,7 @@ class BatchNormKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, @@ -454,7 +453,7 @@ class BatchNormKernel ctx.GetPlace(), transformed_x.type(), reserve_space_size); workspace_ptr = workspace_tensor.mutable_data( ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, @@ -508,7 +507,7 @@ class BatchNormKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationForwardTraining( // handle, mode_, const_cast(static_cast( // CudnnDataType::kOne())), @@ -537,7 +536,7 @@ class BatchNormKernel // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, @@ -568,15 +567,15 @@ class BatchNormKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // clean when exit. -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #endif } @@ -981,18 +980,18 @@ class BatchNormGradKernel // miopenTensorDescriptor_t bn_param_desc_; // miopenBatchNormMode_t mode_; -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); #endif if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { @@ -1022,18 +1021,18 @@ class BatchNormGradKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, // x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), // const_cast(strides.data()))); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, // data_desc_, mode_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, data_desc_, mode_)); #endif @@ -1063,7 +1062,7 @@ class BatchNormGradKernel Tensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), @@ -1081,7 +1080,7 @@ class BatchNormGradKernel workspace_ptr = workspace_tensor.mutable_data( ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -1151,7 +1150,7 @@ class BatchNormGradKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationBackward( // dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), // CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -1166,7 +1165,7 @@ class BatchNormGradKernel // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -1231,15 +1230,15 @@ class BatchNormGradKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // clean when exit. -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #endif } else { diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 8bd2b7fe2d127..73f73a81c088e 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/bce_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 3c64ed1acc847..3fd8995745acb 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -12,8 +12,8 @@ #include #include #include "paddle/fluid/operators/bilateral_slice_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu index 757f728629106..34facf1ea1fa9 100644 --- a/paddle/fluid/operators/bincount_op.cu +++ b/paddle/fluid/operators/bincount_op.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index bb4246e3e9b84..6b393b5666bb2 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index f15d1fe5e02ac..549bb5ae75aff 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
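In the batch_norm hunks the only functional change is the enforce-macro rename; the cuDNN descriptor lifecycle they wrap is unchanged. A compressed sketch of that lifecycle with the new macro, assuming the dynload signatures used above (the descriptor name below is illustrative):

cudnnTensorDescriptor_t data_desc;
PADDLE_ENFORCE_GPU_SUCCESS(
    platform::dynload::cudnnCreateTensorDescriptor(&data_desc));
// ... cudnnSetTensorNdDescriptor / cudnnDeriveBNTensorDescriptor and the
// forward/backward batch-norm calls follow, each wrapped the same way ...
PADDLE_ENFORCE_GPU_SUCCESS(
    platform::dynload::cudnnDestroyTensorDescriptor(data_desc));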
*/ #include #include "paddle/fluid/operators/center_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu index 4426057305249..0bfddf8b5f386 100644 --- a/paddle/fluid/operators/cholesky_op.cu +++ b/paddle/fluid/operators/cholesky_op.cu @@ -131,27 +131,26 @@ class CholeskyGPUKernel : public framework::OpKernel { int lda, int* info) const { \ auto handle = dev_ctx.cusolver_dn_handle(); \ int workspace_size = 0; \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ + PADDLE_ENFORCE_GPU_SUCCESS( \ platform::dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ auto workspace = memory::Alloc(dev_ctx, workspace_size); \ T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ } FUNC_WITH_TYPES(POTRF_INSTANCE); #if CUDA_VERSION >= 9020 && !defined(_WIN32) -#define POTRF_BATCH_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::PotrfBatched( \ - const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ - int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ - platform::dynload::cusolverDn##C##potrfBatched( \ - handle, uplo, n, Aarray, lda, info_array, batch_size)); \ +#define POTRF_BATCH_INSTANCE(T, C) \ + template <> \ + void CholeskyGPUKernel::PotrfBatched( \ + const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ + int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrfBatched( \ + handle, uplo, n, Aarray, lda, info_array, batch_size)); \ } FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); diff --git a/paddle/fluid/operators/cinn_launch_op.cc b/paddle/fluid/operators/cinn_launch_op.cc index e70a51d880516..f0ad5b3c3bf99 100644 --- a/paddle/fluid/operators/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn_launch_op.cc @@ -13,7 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/operators/cinn_launch_op.h" + +#include #include + #include "paddle/fluid/string/string_helper.h" DECLARE_bool(cudnn_deterministic); @@ -108,33 +111,9 @@ std::unordered_set CinnLaunchContext::GetInternalVariableNames() { return all_parameters; } -void CinnLaunchContext::MutableTensorData(const std::string& var_name, - const platform::Place& place, - LoDTensor* paddle_tensor, - bool is_internal_var) { - auto cinn_name = var_name; - if (!is_internal_var) { - PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Paddle variable(%s) not used by cinn", var_name)); - cinn_name = paddle2cinn_varmap_.at(var_name); - } - - auto cinn_tensor = GetCinnTensor(cinn_name); - // TODO(CtfGo): support mutable corresponding c++ type after CINN ready - VLOG(4) << "Only support float in cinn_launch op now."; - paddle_tensor->mutable_data( - framework::make_ddim(cinn_tensor->shape().data()), place); -} - void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor) { - PADDLE_ENFORCE_EQ( - paddle_tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor in variable(%s) is not initialized.", paddle_name)); - // check dimension auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, @@ -147,27 +126,39 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, } void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, + const platform::Place& place, LoDTensor* paddle_tensor) { PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, platform::errors::InvalidArgument( "Paddle variable(%s) not used by cinn", paddle_name)); const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); - CheckTensorEquivalent(paddle_name, *paddle_tensor, GetCinnTensor(cinn_name)); - return SetArgument(cinn_name, paddle_tensor); + CinnTensor cinn_tensor = GetCinnTensor(cinn_name); + if (!paddle_tensor->IsInitialized()) { + paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + } + CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); + return SetArgument(cinn_name, place, /* free_mem_callback = */ false, + paddle_tensor); } void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, + const platform::Place& place, LoDTensor* paddle_tensor) { PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, platform::errors::InvalidArgument( "Variable(%s) not found in cinn socpe.", cinn_name)); - CheckTensorEquivalent(cinn_name, *paddle_tensor, GetCinnTensor(cinn_name)); - return SetArgument(cinn_name, paddle_tensor); + CinnTensor cinn_tensor = GetCinnTensor(cinn_name); + if (!paddle_tensor->IsInitialized()) { + paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + } + CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); + return SetArgument(cinn_name, place, /* free_mem_callback = */ true, + paddle_tensor); } std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( - LoDTensor* tensor) { + const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { // convert paddle dimensions array to cinn format std::vector cinn_dims(tensor->dims().size()); for (auto i = 0; i < tensor->dims().size(); ++i) { @@ -177,19 +168,42 @@ std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( auto cinn_buffer = std::make_unique(); // assign size and memory cinn_buffer->resize(cinn_dims.data(), 
cinn_dims.size()); - cinn_buffer->memory = reinterpret_cast(tensor->data()); + + cinn_buffer->external_malloc = new std::function( + [place, tensor](void* ctx, cinn_buffer_t* buffer) { + buffer->memory = + reinterpret_cast(tensor->mutable_data(place)); + return 0; + }); + + if (free_mem_callback) { + cinn_buffer->external_free = new std::function( + [tensor](void* ctx, cinn_buffer_t* buffer) { + tensor->clear(); + return 0; + }); + return cinn_buffer; + } + + cinn_buffer->external_free = new std::function( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); return cinn_buffer; } void CinnLaunchContext::SetArgument(const std::string& cinn_name, + const platform::Place& place, + bool free_mem_callback, LoDTensor* paddle_tensor) { - auto buffer = ShareTensorWithCinnBuffer(paddle_tensor); + auto buffer = + ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); name2argument_.emplace(cinn_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), " - << "type(" << framework::DataTypeToString(paddle_tensor->type()) - << "), dims(" << paddle_tensor->dims() << ")."; + << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() + << ")."; } const std::map& diff --git a/paddle/fluid/operators/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn_launch_op.cu.cc index d557cfc7c0892..fae2d6ddb487d 100644 --- a/paddle/fluid/operators/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn_launch_op.cu.cc @@ -18,9 +18,9 @@ limitations under the License. */ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/type_defs.h" #ifdef PADDLE_WITH_CUDA #include @@ -45,9 +45,9 @@ void CUDART_CB ReleaseBuffers(void* data) { template <> void ReleaseResource( const std::vector& resources, void* stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( static_cast(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( static_cast(stream), ReleaseBuffers, resources[1])); } diff --git a/paddle/fluid/operators/cinn_launch_op.h b/paddle/fluid/operators/cinn_launch_op.h index 53e6ff0d61387..2b1bf89197dff 100644 --- a/paddle/fluid/operators/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn_launch_op.h @@ -49,16 +49,13 @@ class CinnLaunchContext { // Return whether a Paddle variable used on compiled kernels bool IsVariableUsed(const std::string& var_name); - // Allocate buffer to a Paddle tensor with assginment information from CINN - void MutableTensorData(const std::string& var_name, - const platform::Place& place, LoDTensor* paddle_tensor, - bool is_internal_var = false); - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, LoDTensor* tensor); + void AssignExternalVariable(const std::string& var_name, + const platform::Place& place, LoDTensor* tensor); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, LoDTensor* tensor); + void AssignInternalVariable(const std::string& var_name, + const platform::Place& place, LoDTensor* tensor); // Extract internal variable names from CinnScope // by 
excluding used input and output variables @@ -83,10 +80,12 @@ class CinnLaunchContext { // Share the buffer of a Paddle tensor to CINN by delivering memory address // to a cinn_buffer_t object - std::unique_ptr ShareTensorWithCinnBuffer(LoDTensor* tensor); + std::unique_ptr ShareTensorWithCinnBuffer( + const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); // Set an argument with (cinn name)->(paddle tensor) pair - void SetArgument(const std::string& cinn_name, LoDTensor* paddle_tensor); + void SetArgument(const std::string& cinn_name, const platform::Place& place, + bool free_mem_callback, LoDTensor* paddle_tensor); private: // a variable name map from paddle to cinn @@ -198,7 +197,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { } launch_context->AssignExternalVariable( - var_name, scope.GetVar(var_name)->GetMutable()); + var_name, place, scope.GetVar(var_name)->GetMutable()); } // 3.2 Prepare output variables: all output variables should @@ -215,11 +214,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { "Output variable(%s) not used by cinn", var_name)); auto* tensor = scope.GetVar(var_name)->GetMutable(); - if (!tensor->IsInitialized()) { - launch_context->MutableTensorData(var_name, place, tensor); - } - launch_context->AssignExternalVariable( - var_name, scope.GetVar(var_name)->GetMutable()); + launch_context->AssignExternalVariable(var_name, place, tensor); } // 3.3 Prepare internal or temporary variables: Create a temporary @@ -232,8 +227,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { framework::Scope* temp_scope = scope.NewTmpScope().release(); for (const auto& var_name : internal_variable_names) { auto* tensor = temp_scope->Var(var_name)->GetMutable(); - launch_context->MutableTensorData(var_name, place, tensor, true); - launch_context->AssignInternalVariable(var_name, tensor); + launch_context->AssignInternalVariable(var_name, place, tensor); } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. 
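The cinn_launch_op changes above replace the eager MutableTensorData path with lazily bound buffers: AssignExternalVariable/AssignInternalVariable now take the target place, resize uninitialized tensors from the CINN shape, and install external_malloc/external_free callbacks so memory is materialized only when the CINN runtime touches the buffer, and temporaries are released afterwards. A minimal sketch of that binding, following the lambdas added in ShareTensorWithCinnBuffer; the std::function template arguments and the float element type are reconstructed here (this view of the patch drops template brackets), so treat them as assumptions:

auto cinn_buffer = std::make_unique<cinn_buffer_t>();
cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());

// Allocation is deferred until the CINN runtime invokes the callback.
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
    [place, tensor](void* ctx, cinn_buffer_t* buffer) {
      buffer->memory =
          reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
      return 0;
    });

// Internal (temporary) variables also free their memory once the compiled
// kernel finishes; external inputs/outputs install a no-op here instead.
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
    [tensor](void* ctx, cinn_buffer_t* buffer) {
      tensor->clear();
      return 0;
    });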
diff --git a/paddle/fluid/operators/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn_launch_op_test.cc index 5a07a49a5969a..5e0b87d06afea 100644 --- a/paddle/fluid/operators/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn_launch_op_test.cc @@ -222,30 +222,9 @@ TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { auto launch_context = std::make_unique(GetDefaultCompiledObj()); auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); -} - -TEST(CinnLaunchContextTest, TestMutableTensorData) { - platform::CPUPlace place; - framework::Scope scope; - auto* tensor1 = scope.Var("var1")->GetMutable(); - auto* tensor2 = scope.Var("var2")->GetMutable(); - - auto launch_context = - std::make_unique(GetDefaultCompiledObj()); - // mutable_data on external variable - ASSERT_NO_THROW(launch_context->MutableTensorData("var1", place, tensor1)); - ASSERT_TRUE(tensor1->IsInitialized()); - ASSERT_EQ(tensor1->dims(), framework::make_ddim({3, 4})); - ASSERT_THROW(launch_context->MutableTensorData("not_exist", place, tensor1), - paddle::platform::EnforceNotMet); - - // mutable_data on internal variable - ASSERT_NO_THROW( - launch_context->MutableTensorData("cinn_var2", place, tensor2, true)); - ASSERT_TRUE(tensor2->IsInitialized()); - ASSERT_EQ(tensor2->dims(), framework::make_ddim({6, 7, 8})); + ASSERT_EQ(internal_variable_names.size(), 3); + EXPECT_NE(internal_variable_names.find("cinn_var2"), + internal_variable_names.end()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { @@ -255,12 +234,9 @@ TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { framework::Scope scope; auto* tensor1 = scope.Var("var1")->GetMutable(); - // CheckTensorEquivalent: tensor is not initialized - ASSERT_THROW(launch_context->AssignExternalVariable("var1", tensor1), - paddle::platform::EnforceNotMet); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), paddle::platform::EnforceNotMet); } @@ -272,11 +248,12 @@ TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { auto* tensor4 = scope.Var("var4")->GetMutable(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW(launch_context->AssignExternalVariable("cinn_var4", tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW( + launch_context->AssignExternalVariable("cinn_var4", place, tensor4), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { @@ -286,22 +263,25 @@ TEST(CinnLaunchContextTest, TestSetArgument) { platform::CPUPlace place; framework::Scope scope; auto* tensor1 = scope.Var("var1")->GetMutable(); - tensor1->mutable_data(framework::make_ddim({3, 4}), place); - auto* data1 = tensor1->data(); + float* data1 = + tensor1->mutable_data(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; // assign external variable - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1", tensor1)); + ASSERT_NO_THROW( + launch_context->AssignExternalVariable("var1", place, tensor1)); auto* tensor2 = scope.Var("var2")->GetMutable(); tensor2->mutable_data(framework::make_ddim({6, 7, 8}), 
place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2", tensor2)); + ASSERT_NO_THROW( + launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); // FinalizeArguments not missed check ASSERT_THROW(launch_context->FinalizeArguments(), paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable(); tensor3->mutable_data(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3", tensor3)); + ASSERT_NO_THROW( + launch_context->AssignExternalVariable("var3", place, tensor3)); auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); @@ -310,6 +290,8 @@ TEST(CinnLaunchContextTest, TestSetArgument) { auto* cinn_buffer = static_cast(name2argument.at("cinn_var1")); + ASSERT_EQ(cinn_buffer->memory, nullptr); + cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); ASSERT_EQ(cinn_buffer->num_elements(), 12); auto* shadow_data = reinterpret_cast(cinn_buffer->memory); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index cfcfd04e6fc7c..29286be0dd6b2 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -335,7 +335,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { static_cast( platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) ->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, @@ -346,13 +346,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // step 2: Determine temporary device storage requirements int num_buffer_ele = std::max(batch_size, num_classes); size_t cub_sort_temp_store_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( nullptr, cub_sort_temp_store_size, nullptr, nullptr, nullptr, nullptr, num_buffer_ele, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); size_t cub_sum_temp_store_size = 0; NotEqualToPreviousAdjacentIterator unique_counting_iter_temp(nullptr, 0); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceScan::InclusiveSum, T*>( nullptr, cub_sum_temp_store_size, unique_counting_iter_temp, @@ -360,7 +360,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { size_t cub_scan_temp_store_size = 0; ActualNumSampledFunctor actual_num_sampled_op_temp(num_samples); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveScan( nullptr, cub_scan_temp_store_size, num_classes_per_device_ptr, num_classes_per_device_ptr, actual_num_sampled_op_temp, nranks + 1, ctx.cuda_device_context().stream()))); @@ -384,7 +384,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { void* cub_temp_storage_ptr = memory_buffer.cub_temp_storage_ptr(); // step 4: Calculate class interval among nranks - 
PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum( cub_temp_storage_ptr, cub_temp_storage_bytes, num_classes_per_device_ptr, class_interval_ptr, nranks + 1, ctx.cuda_device_context().stream()))); @@ -415,13 +415,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // step 7: sort class center by ascending, so that positive class center // always be sampled. - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( cub_temp_storage_ptr, cub_temp_storage_bytes, cub_sort_keys_ptr, cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_values_out_ptr, num_classes, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); // step 8: sort input label ascending - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( cub_temp_storage_ptr, cub_temp_storage_bytes, label->data(), cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_keys_ptr, batch_size, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); @@ -430,8 +430,8 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // label NotEqualToPreviousAdjacentIterator unique_counting_iter( cub_sort_keys_out_ptr, 0); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum< - NotEqualToPreviousAdjacentIterator, T*>( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum< + NotEqualToPreviousAdjacentIterator, T*>( cub_temp_storage_ptr, cub_temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, batch_size, ctx.cuda_device_context().stream()))); @@ -445,13 +445,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // Since maybe num_positive_class_center > num_samples, // we need to ensure all positive class center per device are sampled. ActualNumSampledFunctor actual_num_sampled_op(num_samples); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveScan( cub_temp_storage_ptr, cub_temp_storage_bytes, bound_value_ptr, num_classes_per_device_ptr, actual_num_sampled_op, nranks + 1, ctx.cuda_device_context().stream()))); // step 12: Calculate actual sampled class interval among nranks - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum( cub_temp_storage_ptr, cub_temp_storage_bytes, num_classes_per_device_ptr, class_interval_ptr, nranks + 1, ctx.cuda_device_context().stream()))); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 157924f08546b..4e6d86d49e863 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,15 +69,11 @@ class AllReduceOpKernel : public framework::OpKernel { red_type = ncclMin; break; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, static_cast(dtype), red_type, comm, stream)); if (ctx.Attr("sync_mode")) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 1bcb47fc686cf..02b10f17da5a3 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -62,15 +62,15 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); size_t offset = 0; send_numel /= nranks; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < nranks; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); offset += send_numel; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); #else PADDLE_THROW( platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index b8631b44f14ca..c9aef237699f3 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -41,13 +41,9 @@ class BarrierOpCUDAKernel : public framework::OpKernel { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto stream = static_cast(dev_ctx)->stream(); ncclRedOp_t nccl_red_type = ncclSum; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::Unavailable( "PaddlePaddle should compile with NCCL.")); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index fa4d7ee4cce5d..daaaf8b7a2e41 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace ops = paddle::operators; @@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { auto comm = dev_ctx.nccl_comm(); auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, static_cast(in->numel()), platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream)); @@ -62,11 +62,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { << " From " << root_dev_id << " to " << dev_id; if (ctx.Attr("sync_mode")) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 597e4321d66bd..f174473c049ec 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -56,7 +56,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 6d569b454e691..714dc4e19f9b1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -29,7 +29,7 @@ limitations under the License. */ #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -386,7 +386,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index b37bd250c1558..6deb837069761 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -46,7 +46,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { int root = ctx.Attr("root"); if (root == comm->rank()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent " @@ -59,7 +59,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 60a9b1ee44fcc..db9a8428e3d03 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/threadpool.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index aee10dcdc2732..f69fe8f1e3f1f 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -26,7 +26,7 @@ limitations under the License. */ // #include "paddle/fluid/operators/distributed/request_handler_impl.h" #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index bfdc49c440aae..738ed16286131 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -19,7 +19,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -71,7 +71,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 858ca79f85b0e..9b343b34a3e51 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/collective/c_embedding_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 0a0a824b77586..d392beb3a4834 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -30,7 +30,7 @@ namespace operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i])); } } diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 74f41bff9dc86..b950339bd22be 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -30,7 +30,7 @@ limitations under the License. 
*/ #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -316,7 +316,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { "kRedMax, kRedMin, kRedProd.")); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 4d19ee42641f4..141fa760413b3 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -57,7 +57,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( send_buff, recv_buff, recv_numel, static_cast(dtype), ncclSum, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 0c9dc2af14f39..4d4dc0c12af55 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -66,7 +66,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { framework::Tensor temp; auto out_ptr = temp.mutable_data(out_dims, place); if (root_id == comm->rank()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root_id, comm->comm(), stream)); @@ -74,7 +74,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { *platform::DeviceContextPool::Instance().Get(place), static_cast(&temp)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 77db86e711111..6371d523cfa4a 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { @@ -119,7 +119,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { Eigen::DSizes along_axis(1); eigen_logits_max.device(*dev_ctx.eigen_device()) = eigen_logits.maximum(along_axis); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), stream)); @@ -160,7 +160,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { } void* predict_logits_buff = predicted_logits.mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( predict_logits_buff, predict_logits_buff, predicted_logits.numel(), platform::ToNCCLDataType(predicted_logits.type()), ncclSum, comm->comm(), stream)); @@ -178,7 +178,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) = eigen_softmax.sum(along_axis); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 034accbb480c7..a8c4eafede41b 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 72faf4298cf60..72339bbd48752 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -55,11 +55,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif + platform::GpuStreamSync(dev_ctx->stream()); #elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 03894b24a913b..21bad096c2d49 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_ASCEND_CL) @@ -67,11 +67,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #elif defined(PADDLE_WITH_ASCEND_CL) PADDLE_ENFORCE_EQ(is_npu_place(place), true, diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index d0dfc3bb1c2e5..dfa4dcd0fac59 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -54,11 +54,11 @@ class CWaitCommOp : public framework::OperatorBase { // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 12a28040ef1c5..e038617bf3d6a 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -57,11 +57,11 @@ class CWaitComputeOp : public framework::OperatorBase { // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 99a92469e8502..7a5b6b5f429b2 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -20,9 +20,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -37,7 +37,7 @@ namespace operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i])); } } diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 70b5d0244d385..e2ff823420aef 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -103,24 +103,24 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(send_buf + send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); } } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else PADDLE_THROW( diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index bec984c6b57e1..c47d27366c5f2 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -102,24 +102,24 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(send_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); } if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(recv_buf + recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); recv_ptr += cpu_global_count_data[idx]; } } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 8c32f8c41bbf2..094847beca214 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -67,7 +67,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 49eafa5c7c4f5..d59c062a31b8c 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -80,7 +80,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { int recv_numel = numel / num; int offset = recv_numel * id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(out->data() + offset, recv_numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 2463f208746ed..8a4f7f750a15b 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -74,7 +74,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { int send_numel = numel / num; int offset = send_numel * id; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data() + offset, send_numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << send_numel << " from offset[" << offset << "] to " << peer; diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index df94fee5223c6..18d6af4c2aaa1 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,7 +69,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto out_dims = out->dims(); out->mutable_data(out_dims, place, 0); auto numel = out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << framework::product(out_dims) << " from " << peer; @@ -83,7 +83,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto numel = out->numel(); out->mutable_data(out_dims, place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << framework::product(out->dims()) << " from " << peer; diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index dc28910e9ec9c..952fcf2065d59 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -62,7 +62,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto& x = x_array.at(idx); int numel = x.numel(); ncclDataType_t dtype = platform::ToNCCLDataType(x.type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x.data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << framework::product(x.dims()) << " to " << peer; @@ -73,7 +73,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { int numel = x->numel(); ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << framework::product(x->dims()) << " to " << peer; diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index dec0e789776a4..55bd4879ab794 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -27,7 +27,7 @@ class OpBase; } // namespace imperative } // namespace paddle #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif namespace paddle { @@ -35,7 +35,7 @@ namespace operators { static size_t CUDADevCount() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return platform::GetCUDADeviceCount(); + return platform::GetGPUDeviceCount(); #else return 0UL; #endif diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index f4183bf570926..a783a619473ef 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -25,7 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + namespace paddle { namespace operators { @@ -98,7 +99,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { int max_algos = 0; #if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnn_handle, &max_algos)); #endif @@ -176,22 +177,22 @@ static void SetConvMathType(const framework::ExecutionContext& ctx, #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dev_ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_DEFAULT_MATH)); VLOG(5) << "NOT use cudnn_tensor_op_math"; } @@ -245,7 +246,7 @@ struct SearchAlgorithm { int perf_count; int best_algo_idx = 0; std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, @@ -264,7 +265,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), @@ -273,7 +274,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), @@ -306,7 +307,7 @@ struct SearchAlgorithm { std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -332,7 +333,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { size_t 
workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); @@ -362,7 +363,7 @@ struct SearchAlgorithm { int best_algo_idx = 0; std::unique_ptr perf_results( new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, @@ -395,7 +396,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), @@ -404,7 +405,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), @@ -435,7 +436,7 @@ struct SearchAlgorithm { std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.w->data(), @@ -464,7 +465,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); @@ -496,7 +497,7 @@ struct SearchAlgorithm { int best_algo_idx = 0; std::unique_ptr perf_results( new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, @@ -515,7 +516,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), @@ -524,7 +525,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), @@ -553,7 +554,7 @@ struct SearchAlgorithm { int returned_algo_count; std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -584,7 +585,7 @@ struct SearchAlgorithm { algo_t chosen_algo; std::vector perf_results(max_algos); int actual_algos = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithm( args.handle, 
args.idesc.desc(), args.odesc.desc(), @@ -605,7 +606,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 275e81fc7f33a..566e99c357fbe 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -261,9 +261,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it manually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), - groups)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); groups = 1; #endif #ifdef PADDLE_WITH_HIP @@ -328,7 +327,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP workspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args.idesc.desc(), input_data, args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, @@ -340,7 +339,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args.idesc.desc(), input_data + i * group_offset_in, args.wdesc.desc(), @@ -718,7 +717,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), output_grad_data, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), @@ -726,7 +725,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_workspace_ptr, workspace_size)); }, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), transformed_input_grad_data, &alpha, args1.idesc.desc(), temp_tensor_data, &beta, args1.idesc.desc(), @@ -734,7 +733,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), output_grad_data, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), @@ -749,7 +748,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args1.wdesc.desc(), filter_data + i * group_offset_filter, args1.odesc.desc(), @@ -796,7 +795,7 @@ class CUDNNConvGradOpKernel : public 
framework::OpKernel { #ifdef PADDLE_WITH_HIP workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), output_grad_data, args2.idesc.desc(), input_data, args2.cdesc.desc(), @@ -808,7 +807,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args2.idesc.desc(), input_data + i * group_offset_in, args2.odesc.desc(), @@ -1228,7 +1227,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args1.idesc.desc(), ddx, args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, @@ -1240,7 +1239,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args1.idesc.desc(), ddx + i * group_offset_in, args1.wdesc.desc(), @@ -1258,7 +1257,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { // MIOPEN ONLY support beta to be 0.0f wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), ddw, args2.cdesc.desc(), fwd_algo2, &beta, @@ -1270,7 +1269,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args2.idesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), @@ -1294,7 +1293,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), transformed_dy_channel, args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, @@ -1306,7 +1305,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in, args3.odesc.desc(), @@ -1325,7 +1324,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args4.odesc.desc(), transformed_dy_channel, args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, @@ -1337,7 +1336,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( 
platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args4.wdesc.desc(), ddw + i * group_offset_filter, args4.odesc.desc(), diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 23a471cfa0067..291e5f92f322c 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -18,11 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/operator.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_uint64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index befe09c8e6beb..9c9795143eb78 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/miopen_desc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -137,7 +137,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.x->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -154,7 +154,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), &workspace_size)); @@ -179,7 +179,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( args.handle, args.odesc.desc(), args.o->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -196,7 +196,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), &workspace_size)); @@ -221,7 +221,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, args.odesc.desc(), args.o->data(), args.idesc.desc(), args.x->data(), args.cdesc.desc(), @@ -238,7 +238,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), 
args.cdesc.desc(), args.wdesc.desc(), &workspace_size)); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 1610705c4694c..41f6f75200697 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -20,13 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -222,7 +216,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { PADDLE_ENFORCE_GE( - platform::CudnnVersion(), 8100, + platform::DnnVersion(), 8100, platform::errors::InvalidArgument( "bfloat16 can only be used when CUDNN_VERSION >= 8100")); } diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 314d33310588e..2289104d2dbfb 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index c4cd5854c0f78..19c0be44a1d0b 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -265,7 +265,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args.odesc.desc(), input_data + input_offset * g, args.wdesc.desc(), @@ -275,7 +275,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args.wdesc.desc(), filter_data + filter_offset * g, args.odesc.desc(), @@ -549,7 +549,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args1.idesc.desc(), output_grad_data + output_grad_offset * g, args1.wdesc.desc(), @@ -560,13 +560,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, cudnn_workspace, workspace_size, &beta, - args1.odesc.desc(), input_grad_data + input_offset * g)); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( + handle, &alpha, args1.idesc.desc(), + output_grad_data + output_grad_offset * g, args1.wdesc.desc(), + filter_data + filter_offset * g, args1.cdesc.desc(), data_algo, + cudnn_workspace, workspace_size, &beta, args1.odesc.desc(), + input_grad_data + input_offset * g)); }; #endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); @@ -598,7 +597,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), input_data + input_offset * g, args2.idesc.desc(), @@ -609,7 +608,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args2.idesc.desc(), output_grad_data + output_grad_offset * g, args2.odesc.desc(), @@ -1054,7 +1053,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), ddx + i * group_offset_in, args1.wdesc.desc(), @@ -1067,7 +1066,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args1.wdesc.desc(), w + i * group_offset_filter, args1.odesc.desc(), @@ -1089,7 +1088,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), @@ -1099,7 +1098,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { workspace_size)); }, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), transformed_ddy_channel + i * group_offset_out, &alpha, args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, @@ -1108,7 +1107,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.odesc.desc(), @@ -1152,7 +1151,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), ddx + i * group_offset_in, args3.idesc.desc(), @@ -1165,7 +1164,7 @@ class 
CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args3.idesc.desc(), transformed_dy_channel + i * group_offset_out, @@ -1185,7 +1184,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args4.idesc.desc(), transformed_dy_channel + i * group_offset_out, @@ -1198,7 +1197,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args4.idesc.desc(), transformed_dy_channel + i * group_offset_out, diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index b7859237e737a..5451cf815cae3 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/dynload/cudnn.h" namespace paddle { @@ -77,7 +77,7 @@ class ScopedRNNBase { // ------------------- cudnn dropout descriptors --------------------- size_t state_size; if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -86,7 +86,7 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- cudnn rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, @@ -94,14 +94,14 @@ class ScopedRNNBase { #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif // ------------------- cudnn weights_size --------------------- size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); PADDLE_ENFORCE_EQ( weights_size_, sizeof(T) * weight_numel_, @@ -113,10 +113,10 @@ class ScopedRNNBase { std::vector dim_w = {dim_tmp, 1, 1}; weight_desc_.descriptor(layout, dim_w); // ------------------- cudnn workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 27f64b41948be..6f696afa23886 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -111,14 +111,14 @@ void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, // for inference // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, workspace_data->data(), workspace_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, @@ -129,7 +129,7 @@ void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data, @@ -277,7 +277,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { // for train // This interface is used when the input/output is unpadded. 
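Editor's note: the hunks above (conv_cudnn_op_cache.h, conv_miopen_helper.h, cudnn_lstm_cache.h) replace the per-backend includes `cudnn_helper.h` / `miopen_helper.h` with a single `paddle/fluid/platform/device/gpu/gpu_dnn.h`. The contents of that umbrella header are not part of this diff; the sketch below is only a plausible minimal shape for such a header, reusing the backend header names that do appear in the patch.

```cpp
// Hypothetical sketch of an umbrella header in the spirit of gpu_dnn.h:
// operator code includes one header, and the backend split is resolved here.
#pragma once

#ifdef PADDLE_WITH_HIP
// ROCm build: MIOpen descriptor and handle helpers.
#include "paddle/fluid/platform/miopen_desc.h"
#include "paddle/fluid/platform/miopen_helper.h"
#else
// CUDA build: cuDNN descriptor and handle helpers.
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
```

The payoff visible throughout this patch is that call sites no longer carry `#ifdef PADDLE_WITH_HIP` / `#else` pairs just to pick an include.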
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -285,7 +285,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -297,15 +297,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, - rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, - rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, - rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_data_.data(), workspace_size, - reserve_data, reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.init_h_desc(), + init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), + w_data, rnn.y_seq_desc(), out_data, rnn.last_h_desc(), last_h_data, + rnn.last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data_.data(), + workspace_size, reserve_data, reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( "The padded input is supported by " @@ -433,7 +431,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { if (!has_seq_length) { // This interface is used when the input/output is unpadded. 
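Editor's note: the dominant change in these files is the mechanical rename of `PADDLE_ENFORCE_CUDA_SUCCESS` to `PADDLE_ENFORCE_GPU_SUCCESS`, reflecting that one macro now checks status codes on both CUDA and ROCm builds. Paddle's real macro also accepts cuDNN/MIOpen/NCCL status types and raises framework exceptions; the self-contained sketch below covers only the runtime error type and aborts instead, purely to illustrate the pattern. `GPU_CHECK`, `kGpuSuccess`, and `GpuErrorString` are names invented for this sketch.

```cpp
#include <cstdio>
#include <cstdlib>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
constexpr gpuError_t kGpuSuccess = hipSuccess;
inline const char* GpuErrorString(gpuError_t e) { return hipGetErrorString(e); }
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
constexpr gpuError_t kGpuSuccess = cudaSuccess;
inline const char* GpuErrorString(gpuError_t e) { return cudaGetErrorString(e); }
#endif

// Evaluate the runtime call once; if it did not succeed, report the decoded
// error string with source location and stop. A framework macro would throw
// a rich exception instead of aborting.
#define GPU_CHECK(call)                                                      \
  do {                                                                       \
    gpuError_t err__ = (call);                                               \
    if (err__ != kGpuSuccess) {                                              \
      std::fprintf(stderr, "GPU call failed at %s:%d: %s\n", __FILE__,       \
                   __LINE__, GpuErrorString(err__));                         \
      std::abort();                                                          \
    }                                                                        \
  } while (0)
```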
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -442,13 +440,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), rnn.weight_desc(), weight_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -457,7 +455,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), @@ -467,7 +465,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -477,7 +475,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), out->data(), workspace_data_.data(), workspace_size, diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index a6a23a91c76c0..6c059257b94e8 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -92,15 +92,15 @@ struct CudnnRNNCache { std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( x_desc_[i], cudnn_type, 3, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( y_desc_[i], cudnn_type, 3, dims_y.data(), strides_y.data())); } @@ -108,78 +108,78 @@ struct CudnnRNNCache { hidden_size_}; std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( hx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( cx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( hy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( cy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dhx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dcx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dhy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dcy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); size_t state_size; if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state_->Resize({static_cast(state_size)}); uint8_t *dropout_state_data = dropout_state_->mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, seed_)); } else { uint8_t *dropout_state_data = dropout_state_->data(); auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnRestoreDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type)); PADDLE_ENFORCE_EQ( @@ -191,14 +191,14 @@ struct CudnnRNNCache { dim_w[0] = weights_size_ / cudnn_size; dim_w[1] = 1; dim_w[2] = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( w_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( dw_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_, seq_length_, x_desc_, reserve_size_)); @@ -208,40 +208,40 @@ struct CudnnRNNCache { void release() { for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); } delete[] x_desc_; delete[] y_desc_; - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); } }; diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu index d9e19eb7f61a6..977e301f13663 100644 --- a/paddle/fluid/operators/cumsum_op.cu +++ b/paddle/fluid/operators/cumsum_op.cu @@ -24,7 +24,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" using Tensor = paddle::framework::Tensor; using LoDTensor = paddle::framework::LoDTensor; diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 75976c968c9e8..ad96dc24b9206 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cvm_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 1043faa56f01b..5d157a77b3dd1 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -176,23 +176,19 @@ class DataNormGradKernel if (need_sync_stats) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU, and need_sync_stats connot be " diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 67f5ee332eeb2..924adafa4b8d8 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -27,7 +27,7 @@ #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu index e399a1fafdb71..c252700528c49 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cu +++ b/paddle/fluid/operators/deformable_conv_v1_op.cu @@ -30,7 +30,7 @@ #include "paddle/fluid/operators/deformable_conv_v1_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index c1d4cc9d17ab4..6489c1f9784cf 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -32,7 +32,7 @@ #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" 
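Editor's note: in the data_norm_op.cu hunk above, the paired `#ifdef PADDLE_WITH_RCCL` / `#else` stream-synchronize calls are collapsed into one `platform::GpuStreamSync(stream)` call. A minimal, self-contained sketch of such a wrapper is shown below, assuming only the standard CUDA/HIP runtime APIs; the error handling here is a stand-in, not Paddle's.

```cpp
#include <cassert>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Illustrative wrapper in the spirit of platform::GpuStreamSync: block until
// all work queued on `stream` has finished, with no per-backend #ifdef at
// the call site.
inline void GpuStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipError_t err = hipStreamSynchronize(stream);
  assert(err == hipSuccess);
#else
  cudaError_t err = cudaStreamSynchronize(stream);
  assert(err == cudaSuccess);
#endif
  (void)err;  // keep release builds (NDEBUG) warning-free
}
```

Moving the backend split into the wrapper is the same design choice as the include consolidation: the operator files stay backend-neutral.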
namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 9f63f8ed6f520..39f4fdb71b69d 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dequantize_log_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 08c44a2d39ecf..a85bca3646499 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -18,11 +18,20 @@ endfunction() if (WITH_ASCEND_CL) detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) else() detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +endif() + +if(WITH_XPU) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc) +elseif(WITH_ASCEND_CL) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) +else() + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) endif() detection_library(bipartite_match_op SRCS bipartite_match_op.cc) @@ -63,14 +72,6 @@ else() detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() -if(WITH_XPU) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) -elseif(WITH_ASCEND_CL) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) -else() - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) -endif() - detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) #Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 725983f8153e4..6f5137be62011 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -18,15 +18,14 @@ limitations under the License. 
*/ #include #ifdef __NVCC__ #include "cub/cub.cuh" -#include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef __HIPCC__ #include -#include "paddle/fluid/platform/miopen_helper.h" namespace cub = hipcub; #endif #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index e02f99a613c01..17013efcc98b7 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 0693029eaea9c..6e5fa1e293353 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/box_coder_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 70767f1d7b115..ed97559aa8bb5 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -11,7 +11,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ffd9ac6b2af80..bd5703022db90 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -26,7 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 7ccb354e1773a..1df7dcbe670c0 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -26,7 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 5977a434a6023..5ff479eac8df0 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc new file mode 100644 index 0000000000000..bab394689546e --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op_xpu.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +class PriorBoxOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); + + K step_w = static_cast(ctx.Attr("step_w")); + K step_h = static_cast(ctx.Attr("step_h")); + K offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + K step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + const auto& dev_ctx = + ctx.template device_context(); + auto boxes_data = boxes->data(); + auto vars_data = vars->data(); + xpu::VectorParam aspect_ratios_param{ + aspect_ratios.data(), static_cast(aspect_ratios.size()), nullptr}; + xpu::VectorParam min_sizes_param{ + min_sizes.data(), static_cast(min_sizes.size()), nullptr}; + xpu::VectorParam max_sizes_param{ + max_sizes.data(), static_cast(max_sizes.size()), nullptr}; + + int ret = xpu::gen_prior_box( + dev_ctx.x_context(), boxes_data, aspect_ratios_param, min_sizes_param, + max_sizes_param, feature_height, feature_width, img_height, img_width, + offset, step_height, step_width, clip, min_max_aspect_ratios_order); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU gen_prior_box kernel return wrong value[%d %s]", + ret, XPUAPIErrorMsg[ret])); + + int box_num = feature_height * feature_width * num_priors; + int vlen = variances.size(); + for (int i = 0; i < box_num; ++i) { + ret = xpu_memcpy(vars_data + i * vlen, variances.data(), vlen * sizeof(K), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( + "XPU xpu_memcpy return wrong " + "value[%d %s] in prior_box.", + ret, XPUAPIErrorMsg[ret])); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(prior_box, ops::PriorBoxOpXPUKernel); + +#endif diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 7b34e197ffe21..2ddcc7a06f679 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index ed1676200dc47..10c402e5a4078 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 83a0eb87d02dd..23bd6af6bd2e8 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/diagonal_op.cu b/paddle/fluid/operators/diagonal_op.cu index e2b5f24d6619e..b1268e903df19 100644 --- a/paddle/fluid/operators/diagonal_op.cu +++ b/paddle/fluid/operators/diagonal_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diagonal_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index bd4d690577a6f..c97a523caa767 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -167,14 +167,14 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #endif return; diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 80490af33a1f9..f28fa4d6338d7 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index d3ab8ad9d6985..ad5a55aede751 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/elementwise/elementwise_functor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" // only can include the headers in paddle/pten/include dirs @@ -43,8 +43,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 00562767c97a5..2b44c81a4550d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -30,12 +32,69 @@ static __global__ void SimpleElemwiseSubGradCUDAKernel(const T* dout, int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - dx[col] = dout[col]; + if (dx != nullptr) { + dx[col] = dout[col]; + } dy[col] = -dout[col]; col += blockDim.x * gridDim.x; } } +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_sub_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + auto* dout_data = dout->data(); + // dx + if (dx != nullptr) { + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout->dims()) { + if (dx_data != dout_data) { + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), ctx.GetPlace()); + } + std::vector reduce_dims = GetReduceDim(x->dims(), out->dims(), axis); + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl(*dout, dx, reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto* dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout->dims()) { + if (dy_data != dout_data) { + dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); + auto size = dy->numel(); + dim3 grid_size = dim3( + (size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel<<< + grid_size, block_size, 0, + ctx.template device_context().stream()>>>( + dout->data(), size, nullptr, + dy->mutable_data(ctx.GetPlace())); + } + } else { + std::vector reduce_dims = GetReduceDim(y->dims(), out->dims(), axis); + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl(*dout, dy, reduce_dims, stream); + } + } +} + template typename std::enable_if< std::is_same::value>::type diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 94c8edf24a127..08a4e709a37ad 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -71,6 +71,21 @@ struct SubGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } }; +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_sub_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + + ElemwiseExplicitGradCompute, SubGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); +} + template typename std::enable_if< std::is_same::value>::type @@ -79,13 +94,21 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, 
framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); + default_elementwise_sub_grad(ctx, x, y, out, dout, dx, dy); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_sub_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); + template typename std::enable_if< std::is_same::value>::type @@ -108,15 +131,13 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); // skip out auto* out = dout; if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { elementwise_sub_grad(ctx, x, y, out, dout, dx, dy); } else { - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), - SubGradDY()); + default_elementwise_sub_grad(ctx, x, y, out, dout, dx, + dy); } } }; diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index ab45b6f4de276..706475bc82fad 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -30,10 +30,9 @@ namespace operators { static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) { if (copy_to_gpu) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); #elif defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); #else PADDLE_THROW( platform::errors::InvalidArgument("Check your paddle version, current " diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 8f2235c7e3d21..b95bbc775a0d7 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 77bc9e466e808..fd0f42df11875 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -65,19 +65,26 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] 
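Editor's note: the elementwise_sub_op changes above split `default_elementwise_sub_grad` into a CPU overload and a CUDA overload, selected at compile time with `std::enable_if` on the `DeviceContext` template parameter. The library-free sketch below shows just that selection mechanism; the context types, function name, and the trivial CPU loop are illustrative, and the GPU branch elides the kernel launch / reduction that the real patch performs.

```cpp
#include <iostream>
#include <type_traits>

struct CPUDeviceContext {};
struct CUDADeviceContext {};

// Chosen only when DeviceContext is the CPU context: enable_if<...>::type is
// 'void' in that case and ill-formed otherwise, so the non-matching overload
// silently drops out of overload resolution (SFINAE).
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, CPUDeviceContext>::value>::type
default_sub_grad(const T* dout, T* dx, T* dy, int n) {
  for (int i = 0; i < n; ++i) {
    if (dx != nullptr) dx[i] = dout[i];  // d(x - y)/dx = 1
    dy[i] = -dout[i];                    // d(x - y)/dy = -1
  }
  std::cout << "CPU gradient path\n";
}

// Chosen only when DeviceContext is the CUDA context; a real implementation
// would launch a kernel or a broadcast-aware reduction here.
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, CUDADeviceContext>::value>::type
default_sub_grad(const T* dout, T* dx, T* dy, int n) {
  (void)dout; (void)dx; (void)dy; (void)n;
  std::cout << "GPU gradient path (kernel launch elided in this sketch)\n";
}

int main() {
  float dout[3] = {1.f, 2.f, 3.f}, dx[3], dy[3];
  default_sub_grad<CPUDeviceContext, float>(dout, dx, dy, 3);   // CPU overload
  default_sub_grad<CUDADeviceContext, float>(dout, dx, dy, 3);  // GPU overload
}
```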
- auto x2_lods = x2->lod()[0]; + size_t x2_lods_size = x2->dims()[0]; Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { x1_lods.push_back(i + 1); } } else { - x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { + x1_lods = x1->lod()[0]; + } else { + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods.size() - 1; i++) { - for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { + for (size_t i = 0; i < x2_lods_size; i++) { + for (size_t j = i; j < i + 1; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index f7478364cdfc5..990ac8dbc8121 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -22,11 +22,7 @@ limitations under the License. */ namespace cub = hipcub; #endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index 6b94f4ea5bdd2..f2ce0bccd2fb5 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -15,9 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/conv_op.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index f5ee7f5599184..38326e7560c0d 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -18,11 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/math/padding.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -169,7 +165,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenSetConvolutionGroupCount(cudnn_conv_desc, groups)); // Now only support NCHW @@ -194,14 +190,14 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto f_dims = framework::vectorize(filter->dims()); size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, &workspace_size)); int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionForwardAlgorithm( handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, @@ -215,23 +211,23 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { { ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc, output_data, cudnn_workspace, workspace_size)); }; workspace_handle.RunFunc(cudnn_func, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardBias( handle, &alpha, cudnn_bias_desc, bias_data, &beta, cudnn_output_desc, output_data)); if (activation != "identity") { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationForward( handle, cudnn_act_desc, &alpha, cudnn_output_desc, output_data, &beta, cudnn_output_desc, output_data)); } if (residual) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, cudnn_output_desc, output_data, &alpha, cudnn_output_desc, residual_data, &beta, cudnn_output_desc, output_data)); @@ -240,9 +236,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc, - groups)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize(transformed_input.dims())); @@ -273,13 +268,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); auto workspace_handle = 
dev_ctx.cudnn_workspace_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc, - CUDNN_FMA_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_FMA_MATH)); } #endif // CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 @@ -292,20 +286,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { size_t tmp_size = 0; std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); if (workspace_size_in_bytes > workspace_size_limit) workspace_size_limit = workspace_size_in_bytes; #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -319,7 +313,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { std::array fwd_perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, @@ -355,7 +349,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { VLOG(3) << "choose algo " << algo; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); @@ -375,13 +369,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); } else { @@ -392,7 +386,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 
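// Most of the remaining hunks in this file (and in the files below) only
// rename PADDLE_ENFORCE_CUDA_SUCCESS to PADDLE_ENFORCE_GPU_SUCCESS so one
// status check covers both the CUDA and ROCm code paths. A rough sketch of
// what such a check does, assuming a CUDA-only build and runtime-API calls;
// the macro name and message format below are illustrative, not Paddle's:
#include <cuda_runtime.h>
#include <sstream>
#include <stdexcept>

#define CHECK_GPU_SUCCESS(expr)                                      \
  do {                                                               \
    cudaError_t status__ = (expr);                                   \
    if (status__ != cudaSuccess) {                                   \
      std::ostringstream msg__;                                      \
      msg__ << "GPU call failed: " << cudaGetErrorString(status__);  \
      throw std::runtime_error(msg__.str());                         \
    }                                                                \
  } while (0)

// Example: CHECK_GPU_SUCCESS(cudaMemsetAsync(ptr, 0, bytes, stream));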
1.0f : 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index dc703f9a822b5..913772fb65050 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 1de64cf5ad947..13fad0b7cbb3d 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -31,19 +31,19 @@ class CudnnFusionOp { public: explicit CudnnFusionOp(cudnnFusedOps_t op_id) : plan_created_(false) { // New 'fused op' descriptor creation - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateFusedOpsConstParamPack(&op_const_params_, op_id)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( &op_variant_params_, op_id)); } ~CudnnFusionOp() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); } // Execute fused op @@ -53,7 +53,7 @@ class CudnnFusionOp { platform::errors::Fatal( "CudnnFusionOp exec requested without a valid 'plan', need: " ", GetWorkspaceSizeBytes(), Execute().")); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnFusedOpsExecute(cudnn_handle, op_, op_variant_params_)); } @@ -61,9 +61,8 @@ class CudnnFusionOp { template void SetOpConstParamDesc(cudnnFusedOpsConstParamLabel_t param_label, T *param_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetFusedOpsConstParamPackAttribute( - op_const_params_, param_label, param_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, param_ptr)); plan_created_ = false; } @@ -81,9 +80,8 @@ class CudnnFusionOp { template void SetOpConstParamAttr(cudnnFusedOpsConstParamLabel_t param_label, T param) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetFusedOpsConstParamPackAttribute(op_const_params_, - param_label, ¶m)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, ¶m)); plan_created_ = false; } @@ -101,7 +99,7 @@ class CudnnFusionOp { template void SetOpVariantParamAttrPtr(cudnnFusedOpsVariantParamLabel_t param_label, T *param_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + 
PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnSetFusedOpsVariantParamPackAttribute( op_variant_params_, param_label, param_ptr)); } @@ -120,7 +118,7 @@ class CudnnFusionOp { size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { if (!plan_created_) { workspace_bytes_ = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnMakeFusedOpsPlan( cudnn_handle, op_, op_const_params_, &workspace_bytes_)); plan_created_ = true; } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 9b9328a5ca620..c8871388dd450 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -320,7 +319,7 @@ class CudnnNormConvolutionGrad { ScalingParamType beta = use_addto ? 1.0f : 0.0f; ctx.cudnn_workspace_handle().RunFunc( [&](void *cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, args_.out_desc.desc(), output_grad_ptr, @@ -370,7 +369,7 @@ class CudnnNormConvolutionGrad { size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { size_t workspace_size = 0U; auto handle = ctx.cudnn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, args_.filter_desc.desc(), args_.out_desc.desc(), args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 5166ff27234f2..d0205208acc47 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 9f6d6e2270673..173ef48b83dc2 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
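// CudnnFusionOp above caches its fused-ops plan: every SetOpConstParam* call
// resets plan_created_, and GetWorkspaceSizeInBytes() rebuilds the plan (and
// the workspace size) only when that flag is false. The same lazy-rebuild
// pattern in isolation, with a stand-in for cudnnMakeFusedOpsPlan (names are
// illustrative):
#include <cstddef>

class LazyPlanSketch {
 public:
  void SetParam(int value) {
    param_ = value;
    plan_created_ = false;  // any configuration change invalidates the plan
  }
  size_t WorkspaceBytes() {
    if (!plan_created_) {
      // Stand-in for cudnnMakeFusedOpsPlan(), which reports the workspace size.
      workspace_bytes_ = static_cast<size_t>(param_) * 64;
      plan_created_ = true;
    }
    return workspace_bytes_;
  }

 private:
  int param_ = 0;
  bool plan_created_ = false;
  size_t workspace_bytes_ = 0;
};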
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 9339ae8e470de..83328caf3844f 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -22,7 +22,7 @@ #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -107,22 +107,21 @@ class FusedBatchNormActKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; @@ -144,7 +143,7 @@ class FusedBatchNormActKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -158,7 +157,7 @@ class FusedBatchNormActKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, @@ -171,7 +170,7 @@ class FusedBatchNormActKernel reserve_space_size); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x->template data(), @@ -190,9 +189,9 @@ class FusedBatchNormActKernel reserve_space_size)); // clean when exit. 
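// fused_attention_op.cu and fused_bn_activation_op.cu above (and several files
// below) stop including cudnn_helper.h / miopen_helper.h directly and pull in
// the single umbrella header platform/device/gpu/gpu_dnn.h instead. That
// header's body is not part of this diff; a plausible sketch of the dispatch
// it performs (the include paths below are assumptions):
#pragma once
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"  // assumed path
#else
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"   // assumed path
#endif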
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -271,9 +270,9 @@ class FusedBatchNormActGradKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " @@ -282,12 +281,11 @@ class FusedBatchNormActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -305,7 +303,7 @@ class FusedBatchNormActGradKernel cudnnActivationDescriptor_t activation_desc_ = scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -322,7 +320,7 @@ class FusedBatchNormActGradKernel workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -358,9 +356,9 @@ class FusedBatchNormActGradKernel /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index c92b13b5f5847..7c124a0d6b661 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -21,7 +21,7 @@ #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -87,20 +87,19 @@ class FusedBatchNormAddActKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; @@ -122,7 +121,7 @@ class FusedBatchNormAddActKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -136,7 +135,7 @@ class FusedBatchNormAddActKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, @@ -149,7 +148,7 @@ class FusedBatchNormAddActKernel reserve_space_size); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x->template data(), @@ -169,9 +168,9 @@ class FusedBatchNormAddActKernel reserve_space_size)); // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -231,9 +230,9 @@ class FusedBatchNormAddActGradKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " @@ -242,12 +241,11 @@ class FusedBatchNormAddActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -265,7 +263,7 @@ class FusedBatchNormAddActGradKernel cudnnActivationDescriptor_t activation_desc_ = scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -281,7 +279,7 @@ class FusedBatchNormAddActGradKernel workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -315,9 +313,9 @@ class FusedBatchNormAddActGradKernel /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 049c37f1ea0c4..eb651e4ea7b4f 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -23,10 +23,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { @@ -93,7 +93,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, template inline void SetZero(const platform::CUDADeviceContext &ctx, T *ptr, const size_t size) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); } diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dc068e02be4ec..c5b1fd9392950 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index f257d3efa433e..1827e137c15f1 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -169,7 +169,7 @@ void LaunchLayernormResidualDropoutBias( auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); // call layernorm forward diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc index ea1e9512ca519..eeeb004003c9c 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index b3796f1df5fdf..44312be797398 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
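// SetZero above now routes its cudaMemsetAsync call through
// PADDLE_ENFORCE_GPU_SUCCESS. The same asynchronous zeroing of a device
// buffer, outside of Paddle and with the status simply returned to the caller
// (minimal sketch):
#include <cstddef>
#include <cuda_runtime.h>

template <typename T>
cudaError_t ZeroDeviceBuffer(T* dev_ptr, size_t count, cudaStream_t stream) {
  // Enqueue the memset on the given stream; the caller checks the status
  // (Paddle wraps this in PADDLE_ENFORCE_GPU_SUCCESS).
  return cudaMemsetAsync(dev_ptr, 0, count * sizeof(T), stream);
}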
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_uint64(conv_workspace_size_limit); @@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { cudnnConvolutionDescriptor_t* conv_desc = new cudnnConvolutionDescriptor_t[4]; for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); } @@ -127,11 +127,11 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { for (int i = 0; i < 4; ++i) { filter_dims.push_back(framework::vectorize(filters[i]->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); bias_dims.push_back({1, filter_dims[i][0], 1, 1}); bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), bias_strides[i].data())); in_dims.push_back({n, filter_dims[i][1], h, w}); @@ -140,22 +140,21 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_strides.push_back({oc * h * w, h * w, w, 1}); if (i < 2) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionNdDescriptor( conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), CUDNN_CROSS_CORRELATION, compute_type)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionNdDescriptor( conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), CUDNN_CROSS_CORRELATION, compute_type)); } - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], - CUDNN_DEFAULT_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + conv_desc[i], CUDNN_DEFAULT_MATH)); #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], CUDNN_FMA_MATH)); } @@ -165,7 +164,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { in_strides[2][0] = oc * h * w; out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. 
in_strides[3][0] = filter_dims[2][0] * h * w; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); cudnnConvolutionFwdAlgo_t algo[4]; @@ -181,9 +180,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { } for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc[i], cudnn_dtype, 4, out_dims[i].data(), out_strides[i].data())); @@ -192,13 +191,13 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { size_t tmp_size = 0; std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo[i] = (perf_results.get())[best_algo_idx].algo; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], algo[i], &tmp_size)); @@ -215,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { // branch1: pool + 1x1 conv ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, pool_out_desc, temp_data)); @@ -237,7 +236,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { for (int i = 0; i < 4; ++i) { auto func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], static_cast(filters[i]->data()), conv_desc[i], @@ -252,34 +251,34 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t x_desc; cudnnTensorDescriptor_t y_desc; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( handle, CudnnDataType::kOne(), x_desc, static_cast(out_datas[2]), CudnnDataType::kZero(), y_desc, static_cast(output_data + (oc0 + oc1) * h * w))); for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); } }; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 37a442a781571..1fa4225934d39 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace platform { @@ -50,9 +50,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; @@ -92,12 +92,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { dims_y[i] = 1; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( handle, CudnnDataType::kOne(), in_desc, static_cast(ins[k]->data()), CudnnDataType::kZero(), out_desc, static_cast(odata))); @@ -108,9 +108,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { odata += flat_shape[1]; } } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); } }; diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 05af4ff150f39..700de8074ff8a 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu index d9f56ec4dc038..6e5e203e2d943 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index d2002b487ca33..080dadeacaae7 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { @@ -70,7 +70,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, output_data)); @@ -123,13 +123,12 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { output_grad_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output_grad->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSpatialTfSamplerBackward( - handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, - input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, - input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, - output_grad_data, grid_data, CudnnDataType::kZero(), - grid_grad_data)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerBackward( + handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, + input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, + input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, + output_grad_data, grid_data, CudnnDataType::kZero(), + grid_grad_data)); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 0b410f07fcb57..04aa6a3e10f6e 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -17,12 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 762d14096a5ab..8e9f445f3b116 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index e029c84090af1..055fd791af5a3 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,8 +21,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/group_norm_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index 6a9183a8b465b..b9419cbcc57b5 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/histogram_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 46dd91fed6cbc..40a968b8a397d 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -15,8 +15,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_sample_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu index 2353781daaa39..acf959896f949 100644 --- a/paddle/fluid/operators/index_select_op.cu +++ b/paddle/fluid/operators/index_select_op.cu @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -110,22 +110,14 @@ class IndexSelectCUDAKernel : public framework::OpKernel { (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, numel, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } else { const int* index_data = index->data(); index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( in_data, out_data, index_data, numel, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } }; @@ -181,11 +173,7 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, index_data, index_nums, out_nums, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } else { const int* index_data = index->data(); index_select_grad_cuda_kernel<<< @@ -193,11 +181,7 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, index_data, index_nums, out_nums, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } }; diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu index affd0b7e1edd7..e0401366693b1 100644 --- 
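// The index_select kernels above replace their per-backend
// #ifdef PADDLE_WITH_HIP / cudaStreamSynchronize blocks with a single
// platform::GpuStreamSync(stream) call. Its definition is not shown in this
// diff; a minimal sketch of such a wrapper (the alias and function name here
// are illustrative, error handling elided):
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

inline void GpuStreamSyncSketch(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipStreamSynchronize(stream);
#else
  cudaStreamSynchronize(stream);
#endif
}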
a/paddle/fluid/operators/instance_norm_op.cu +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -26,12 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/instance_norm_op.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -114,17 +109,17 @@ class InstanceNormKernel miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); #endif if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { @@ -143,20 +138,19 @@ class InstanceNormKernel auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif const auto *scale = ctx.Input("Scale"); @@ -202,7 +196,7 @@ class InstanceNormKernel functor(dev_ctx, saved_variance, static_cast>(0)); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenBatchNormalizationForwardTraining( handle, miopenBNSpatial, const_cast( @@ -225,12 +219,12 @@ class InstanceNormKernel saved_variance->template mutable_data>( ctx.GetPlace())))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x_tmp.template data(), @@ -243,9 +237,9 @@ class InstanceNormKernel saved_variance->template mutable_data>( ctx.GetPlace()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } @@ -396,17 +390,17 @@ class InstanceNormGradKernel miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); #endif @@ -418,20 +412,19 @@ class InstanceNormGradKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif const auto *saved_mean = ctx.Input("SavedMean"); @@ -442,7 +435,7 @@ class InstanceNormGradKernel saved_var->template data>(); if (d_scale && d_bias) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenBatchNormalizationBackward( dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -456,7 +449,7 @@ class InstanceNormGradKernel ctx.GetPlace()), epsilon, saved_mean_data, saved_var_data)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), @@ -487,14 +480,14 @@ class InstanceNormGradKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 6be7dbdc110d5..3c857eb326ace 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -12,8 +12,8 @@ #include #include #include "paddle/fluid/operators/interpolate_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index fe9228135606d..bc1ab704aafe3 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -13,9 +13,9 @@ #include #include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 73316d66b6cf2..2320b9e0b2fbf 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -21,7 +21,7 @@ #include #endif -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/float16.h" namespace 
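// The new InverseFunctor added to functor_primitives.h above negates its
// argument; upstream it is templated on the input and output types (Tx, Ty).
// A host-side usage sketch of the same behavior (illustrative struct name):
#include <cassert>

struct InverseFunctorSketch {
  float operator()(const float& x) const { return -x; }
};

int main() {
  InverseFunctorSketch negate;
  assert(negate(3.5f) == -3.5f);  // additive inverse, matching the name
  return 0;
}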
paddle { diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h index 3fce3b1c0920a..d7aed8595ba05 100644 --- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -86,6 +86,20 @@ struct DivideFunctor { Tx n_inv; }; +/** + * @brief Default inverse functor + */ +template +struct InverseFunctor { + HOSTDEVICE inline InverseFunctor() {} + + HOSTDEVICE explicit inline InverseFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(-x); + } +}; + /** * @brief Default unary square functor */ diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 4280c86ca99ab..3656bd1a18167 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -23,13 +23,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index a4f0693323297..4bf2a7cb372cb 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index ec9f5dd95d4d0..5d2a1683d381b 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/tensor_utils.h" diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 7c47ad90502eb..6676cde1cafca 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -16,7 +16,7 @@ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/operators/math/functors.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 3edea025b2a04..5aa546cbcc21a 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 493966ecda7bd..317f9eeb94f39 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_v2_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 7c5e64d2afa46..1deaa3ef1ee7c 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,7 +69,7 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, platform::DeviceContextPool::Instance().Get(place)) ->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, @@ -314,7 +314,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), stream)); @@ -335,7 +335,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), stream)); @@ -368,7 +368,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( loss_ptr, loss_ptr, loss->numel(), platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(), stream)); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index ed3ead47d171e..0cc552d34c587 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the 
License. */ #include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 70c6cf9dcab03..92162e639ff86 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -17,7 +17,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -32,33 +32,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasScopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -82,7 +82,7 @@ struct CUBlas { VLOG(5) << "use_tensor_op_math: " << (dev_ctx->tensor_core_available() ? "True" : "False"); dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc)); }); @@ -94,36 +94,33 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasSmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsmBatched(args...)); } }; @@ -131,33 +128,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -173,36 +170,33 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDtrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasDmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDtrsmBatched(args...)); } }; @@ -215,7 +209,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasHgemm(handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -235,7 +229,7 @@ struct CUBlas { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasHgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -270,7 +264,7 @@ struct CUBlas { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -289,7 +283,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, @@ -301,7 +295,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -320,7 +314,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -340,7 +334,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -355,7 +349,7 @@ struct CUBlas> { const paddle::platform::complex *alpha, const paddle::platform::complex *A, int lda, paddle::platform::complex *B, int ldb) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCtrsm( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -384,7 +378,7 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -401,7 +395,7 @@ struct CUBlas> { const paddle::platform::complex **A, int lda, paddle::platform::complex **B, int ldb, int 
batch_size) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCtrsmBatched( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -417,7 +411,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, @@ -429,7 +423,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -448,7 +442,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -468,7 +462,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -483,7 +477,7 @@ struct CUBlas> { const paddle::platform::complex *alpha, const paddle::platform::complex *A, int lda, paddle::platform::complex *B, int ldb) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZtrsm( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -497,7 +491,7 @@ struct CUBlas> { const paddle::platform::complex **A, int lda, paddle::platform::complex **B, int ldb, int batch_size) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZtrsmBatched( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -526,7 +520,7 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -842,7 +836,7 @@ void Blas::BatchedGEMM( auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); }); diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index f972d38adda5f..32479189eea58 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" -#include "paddle/fluid/platform/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -31,32 +31,32 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_saxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_saxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_scopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_scopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_sgemm_strided_batched(args...)); } @@ -70,7 +70,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_strsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_strsm(args...)); } template @@ -102,32 +102,32 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_daxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_daxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_dgemm_strided_batched(args...)); } @@ -139,7 +139,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dtrsm(args...)); } template @@ -176,7 +176,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_hgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_hgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -195,14 +195,13 @@ struct CUBlas { const float16 *beta, float16 *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_hgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_hgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, batchCount)); } // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. @@ -217,7 +216,7 @@ struct CUBlas { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); @@ -232,7 +231,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -245,7 +244,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_caxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_caxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -263,15 +262,14 @@ struct CUBlas> { platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_cgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, + batchCount)); } static void GEMM(rocblas_handle handle, rocblas_operation transa, @@ -281,7 +279,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const 
platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -302,7 +300,7 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); @@ -317,7 +315,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -330,7 +328,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -348,15 +346,14 @@ struct CUBlas> { platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_zgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, + batchCount)); } static void GEMM(rocblas_handle handle, rocblas_operation transa, @@ -366,7 +363,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -387,7 +384,7 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 614ae93d9fa82..bc2d496a3e76a 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -19,7 +19,7 @@ limitations under the License. 
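Note on the blas_impl.cu.h / blas_impl.hip.h hunks above: the CUDA-only PADDLE_ENFORCE_CUDA_SUCCESS macro is replaced by PADDLE_ENFORCE_GPU_SUCCESS, and GPU headers such as gpu_info.h move under paddle/fluid/platform/device/gpu/, so the same wrapper code checks cuBLAS and rocBLAS status codes without backend-specific macros. The snippet below is only a self-contained analogue of that call-site shape, not Paddle code; the enum, the macro name, and FakeGemm are illustrative stand-ins.

#include <cstdio>
#include <cstdlib>

// Stand-in for cudaError_t / hipError_t / cublasStatus_t and their success value.
enum GpuStatus { kGpuSuccess = 0, kGpuError = 1 };

// Analogue of PADDLE_ENFORCE_GPU_SUCCESS: evaluate the vendor call once and
// abort with the failing expression if it did not return the success code.
#define ENFORCE_GPU_SUCCESS(expr)                                      \
  do {                                                                 \
    GpuStatus _s = (expr);                                             \
    if (_s != kGpuSuccess) {                                           \
      std::fprintf(stderr, "GPU call failed: %s -> %d\n", #expr, _s);  \
      std::abort();                                                    \
    }                                                                  \
  } while (0)

GpuStatus FakeGemm() { return kGpuSuccess; }  // placeholder for cublasSgemm / rocblas_sgemm

int main() {
  ENFORCE_GPU_SUCCESS(FakeGemm());  // identical call-site shape for CUDA and ROCm builds
  return 0;
}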
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -287,13 +287,11 @@ class ConcatFunctor { const T** dev_ins_data = nullptr; if (!has_same_shape || in_num < 2 || in_num > 4) { tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), - static_cast(inputs_data), in_num * sizeof(T*), - context.stream()); - } + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_ins_data->ptr(), platform::CPUPlace(), restored, + in_num * sizeof(T*), context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -317,13 +315,12 @@ class ConcatFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, inputs_col_num * sizeof(int64_t)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - static_cast(inputs_col), - inputs_col_num * sizeof(int64_t), context.stream()); - } + + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + inputs_col_num * sizeof(int64_t), context.stream()); int64_t* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); @@ -422,13 +419,11 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_data), o_num * sizeof(T*), - context.stream()); - } + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_outs_data->ptr(), platform::CPUPlace(), restored, + o_num * sizeof(T*), context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -452,13 +447,11 @@ class SplitFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_cols), - outputs_cols_num * sizeof(int64_t), context.stream()); - } + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 537c7e47155fe..56ba145da1cad 100644 
--- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 55662e1d0aad7..3e80e40f3577c 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 6da1bfb964f24..6ff2ddaa338df 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -23,8 +23,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h index 62c45f4dc098b..75d4809a462cb 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index 24885d37020dc..851a62dbe9a48 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -17,7 +17,7 @@ limitations under the License. 
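The ConcatFunctor/SplitFunctor hunks above drop the SkipCUDAGraphCaptureGuard block and instead route the host-side pointer and column arrays through platform::RestoreHostMemIfCapturingCUDAGraph before memory::Copy. The likely intent is that, while a CUDA graph is being captured, the host-to-device copy is recorded against host memory that stays valid for replay rather than being skipped. The sketch below is only an analogue of that idea with invented names; it is not the Paddle implementation.

#include <cstring>
#include <deque>
#include <vector>

static bool g_capturing = false;                       // stand-in for the CUDA-graph capture flag
static std::deque<std::vector<unsigned char>> g_kept;  // host copies kept alive for graph replay

// Analogue of RestoreHostMemIfCapturingCUDAGraph: outside capture, hand back the
// caller's transient buffer; during capture, snapshot it into long-lived storage
// and return that pointer, so the recorded H2D copy reads valid memory on replay.
template <typename T>
const T* RestoreHostMemIfCapturing(const T* host_ptr, size_t n) {
  if (!g_capturing) return host_ptr;
  g_kept.emplace_back(n * sizeof(T));
  std::memcpy(g_kept.back().data(), host_ptr, n * sizeof(T));
  return reinterpret_cast<const T*>(g_kept.back().data());
}

int main() {
  const long long cols[3] = {0, 4, 8};                 // e.g. inputs_col in ConcatFunctor
  const long long* src = RestoreHostMemIfCapturing(cols, 3);
  (void)src;                                           // would be what memory::Copy reads from
  return 0;
}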
*/ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 01f05530e34e6..b24f5d40e8dca 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -184,14 +184,12 @@ struct MatrixEighFunctor { values_stride >= 32 && values_stride <= 512); syevjInfo_t syevj_params; if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(&syevj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cusolverDnSsyevj_bufferSize( - dev_ctx.cusolver_dn_handle(), jobz, uplo, n, - reinterpret_cast(input_vector), lda, - reinterpret_cast(out_value), &lwork, - syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( + dev_ctx.cusolver_dn_handle(), jobz, uplo, n, + reinterpret_cast(input_vector), lda, + reinterpret_cast(out_value), &lwork, syevj_params)); } else { EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, input_vector, lda, out_value, &lwork); @@ -203,7 +201,7 @@ struct MatrixEighFunctor { auto *value_data = out_value + i * values_stride; auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( handle, jobz, uplo, n, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), lwork, info_ptr, @@ -220,7 +218,7 @@ struct MatrixEighFunctor { } if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(syevj_params)); } if (has_vectors) { @@ -255,7 +253,7 @@ struct MatrixEighFunctor { cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, \ int *lwork) const { \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ + PADDLE_ENFORCE_GPU_SUCCESS( \ platform::dynload::cusolverDn##C##evd_bufferSize( \ handle, jobz, uplo, n, reinterpret_cast(A), lda, \ W, lwork)); \ @@ -269,7 +267,7 @@ FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, T *A, int lda, ValueType *W, T *work, \ int lwork, int *devInfo) const { \ - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDn##C##evd( \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##evd( \ handle, jobz, uplo, n, reinterpret_cast(A), lda, W, \ reinterpret_cast(work), lwork, devInfo)); \ } diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 3eadaa2677ab4..f616e116d0aee 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 71080bf424a01..54a37db1df71a 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -42,7 +42,7 @@ static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, void *temp_storage = nullptr; size_t temp_storage_bytes = 0; for (size_t i = 0; i < 2; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::InclusiveScan( temp_storage, temp_storage_bytes, x_iter, y_iter, op, static_cast(n), // Maybe overflow? dev_ctx.stream())); diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 8b134a29d81cf..1856fb4eb48c7 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 84a970a9a2606..076d3aa3361f0 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -16,10 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/fluid/platform/gpu_launch_config.h" #ifdef __HIPCC__ #define POOLING_BLOCK_SIZE 256 diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index dc1e3c1c3ded1..70aae2ba59e2c 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -16,11 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 446acc033eb7f..f596c1bc3dcf3 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -144,13 +144,13 @@ void GPUSampleWithProb::operator()( VLOG(1) << "num_tries: " << num_tries; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - hipMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + hipMemcpyHostToDevice)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + cudaMemcpyHostToDevice)); #endif int threads = 512; diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index b49b5036ac42e..67cf316246007 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f3ef537a31b44..0e04c37ed2b12 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index cba8dd935ef1b..b3e1922e10657 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,7 +16,7 @@ limitations under the License. 
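The prelu.h hunk above (and softmax.cu further on) replaces the per-file "#ifdef PADDLE_WITH_HIP ? miopen_helper.h : cudnn_helper.h" pair with the single paddle/fluid/platform/device/gpu/gpu_dnn.h header, which presumably performs that dispatch once. A minimal self-contained illustration of the pattern; the tag types stand in for miopenHandle_t / cudnnHandle_t and the real includes are omitted.

struct MiopenHandleTag;  // stand-in for miopenHandle_t
struct CudnnHandleTag;   // stand-in for cudnnHandle_t

#if defined(PADDLE_WITH_HIP)
using dnnHandle_t = MiopenHandleTag*;  // gpu_dnn.h would pull in the MIOpen wrapper here
#else
using dnnHandle_t = CudnnHandleTag*;   // ...and the cuDNN wrapper here
#endif

int main() {
  dnnHandle_t handle = nullptr;  // downstream kernels are written once against the unified alias
  (void)handle;
  return 0;
}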
*/ #include #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 5578f1f0138c4..1807c77e37ca1 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 9e9fe5b9c1020..bc32e068f566d 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -16,11 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -54,7 +50,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, Y->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE, @@ -64,7 +60,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -97,7 +93,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), CudnnDataType::kZero(), cudnn_xgrad_desc, @@ -110,7 +106,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git 
a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index ad23892f37903..dbb3d64350cae 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index d83b5b0fe3afb..d9c757544a9c6 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index d85a262b5e910..757c780b4ea53 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -162,9 +162,9 @@ void MatrixRankGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); @@ -173,7 +173,7 @@ void MatrixRankGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -186,7 +186,7 @@ void MatrixRankGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -203,9 +203,9 @@ void MatrixRankGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); @@ -214,7 +214,7 @@ void MatrixRankGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? 
k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -228,7 +228,7 @@ void MatrixRankGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -247,14 +247,14 @@ void MatrixRankGPUKernel::SyevjBatched( int stride_A = lda * n; int lwork = 0; syevjInfo_t params = NULL; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); @@ -268,7 +268,7 @@ void MatrixRankGPUKernel::SyevjBatched( "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(params)); } @@ -285,15 +285,15 @@ void MatrixRankGPUKernel::SyevjBatched( int stride_A = lda * n; int lwork = 0; syevjInfo_t params = NULL; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); int error_info; @@ -306,7 +306,7 @@ void MatrixRankGPUKernel::SyevjBatched( "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(params)); } diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 7098a720cc3a0..79aff52a16fa9 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
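GesvdjBatched and SyevjBatched above keep cuSOLVER's usual four-step sequence: create the params object, query the workspace size with the *_bufferSize call, run the solver per batch item, then destroy the params; only the error-check macro changes. For reference, a standalone sketch of that sequence for a single 2x2 symmetric eigenproblem; error handling is trimmed to asserts and it needs the CUDA toolkit to build.

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>
#include <cusolverDn.h>

int main() {
  // Column-major symmetric matrix {{2,1},{1,2}}; its eigenvalues are 1 and 3.
  const int n = 2, lda = 2;
  const float hA[4] = {2.f, 1.f, 1.f, 2.f};
  float *dA = nullptr, *dW = nullptr, *dWork = nullptr;
  int *dInfo = nullptr, lwork = 0, hInfo = -1;

  cusolverDnHandle_t handle;
  syevjInfo_t params;
  assert(cusolverDnCreate(&handle) == CUSOLVER_STATUS_SUCCESS);
  assert(cusolverDnCreateSyevjInfo(&params) == CUSOLVER_STATUS_SUCCESS);  // step 1: params

  cudaMalloc(&dA, sizeof(hA));
  cudaMalloc(&dW, n * sizeof(float));
  cudaMalloc(&dInfo, sizeof(int));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

  assert(cusolverDnSsyevj_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR,    // step 2: size query
                                     CUBLAS_FILL_MODE_LOWER, n, dA, lda,
                                     dW, &lwork, params) == CUSOLVER_STATUS_SUCCESS);
  cudaMalloc(&dWork, lwork * sizeof(float));

  assert(cusolverDnSsyevj(handle, CUSOLVER_EIG_MODE_VECTOR,               // step 3: solve
                          CUBLAS_FILL_MODE_LOWER, n, dA, lda, dW,
                          dWork, lwork, dInfo, params) == CUSOLVER_STATUS_SUCCESS);
  cudaMemcpy(&hInfo, dInfo, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("syevj info = %d (0 means converged)\n", hInfo);

  cusolverDnDestroySyevjInfo(params);                                     // step 4: cleanup
  cusolverDnDestroy(handle);
  cudaFree(dWork); cudaFree(dInfo); cudaFree(dW); cudaFree(dA);
  return 0;
}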
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/mean_iou_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 430036bc67de7..1a10b7033f69e 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -19,7 +19,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/mean_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -65,14 +65,14 @@ class MeanCUDAKernel : public framework::OpKernel { auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), context.GetPlace()); err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); } }; diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 3998db6731b3d..43ac5984bc8c8 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -41,7 +41,12 @@ class MemcpyH2DFunctor { void operator()(const framework::LoDTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + out_tensor.mutable_data( + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()), + lod_tensor.type(), + static_cast(&dev_ctx_)->stream()); +#endif if (dst_place_type_ == 0 || dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index 3d22fc60993c7..6f19100fa9d37 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu index 40609381c17ae..1cb7eba8775e8 100644 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ b/paddle/fluid/operators/metrics/auc_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index a357e6e5af6af..c307218baa406 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -66,7 +66,7 @@ class ScopedRNNBase { // ------------------- miopen dropout descriptors --------------------- size_t state_size; if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -75,7 +75,7 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- miopen rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, @@ -83,7 +83,7 @@ class ScopedRNNBase { // ------------------- miopen weights_size --------------------- size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, miopen_type)); PADDLE_ENFORCE_EQ( weights_size_, sizeof(T) * weight_numel_, @@ -95,10 +95,10 @@ class ScopedRNNBase { std::vector dim_w = {dim_tmp, 1, 1}; weight_desc_.descriptor(layout, dim_w); // ------------------- miopen workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index 97d608331ccb5..38cea39abd5de 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -95,16 +95,16 @@ struct CudnnRNNCache { std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&y_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( x_desc_[i], miopen_type, 3, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( y_desc_[i], miopen_type, 3, const_cast(dims_y.data()), const_cast(strides_y.data()))); } @@ -113,85 +113,85 @@ struct CudnnRNNCache { hidden_size_}; std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( hx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( cx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( hy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( cy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dhx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dcx_desc_, miopen_type, 3, 
const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dhy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dcy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateDropoutDescriptor(&dropout_desc_)); size_t state_size; if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state_->Resize({static_cast(state_size)}); uint8_t *dropout_state_data = dropout_state_->mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, seed_, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } else { uint8_t *dropout_state_data = dropout_state_->data(); auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenRestoreDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateRNNDescriptor(&rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor( rnn_desc_, hidden_size_, num_layers_, miopenRNNlinear, is_bidirec_ ? 
miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, miopenRNNNoBias, miopenRNNdefault, miopen_type)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dw_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_, x_desc_[0], &weights_size_, miopen_type)); PADDLE_ENFORCE_EQ( @@ -208,14 +208,14 @@ struct CudnnRNNCache { dim_s[1] = 1; dim_s[0] = dim_w[1]; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( w_desc_, miopen_type, 3, dim_w, dim_s)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dw_desc_, miopen_type, 3, dim_w, dim_s)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_, seq_length_, x_desc_, reserve_size_)); @@ -225,40 +225,40 @@ struct CudnnRNNCache { void release() { for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(y_desc_[i])); } delete[] x_desc_; delete[] y_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyDropoutDescriptor(dropout_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyRNNDescriptor(rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dw_desc_)); } }; diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu index 6513e5d95e4ac..4ca07b650c80a 100644 --- a/paddle/fluid/operators/mish_op.cu +++ b/paddle/fluid/operators/mish_op.cu @@ -11,8 +11,8 @@ limitations under the License. 
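The CudnnRNNCache changes above keep the existing structure: every descriptor is created in the constructor, configured with a Set call, and destroyed in release(), with each MIOpen call wrapped in the status check. The fragment below is a hypothetical, self-contained RAII variant (not what the operator does), included only to make that create/set/destroy contract explicit; FakeStatus and the three free functions stand in for the miopen* entry points.

#include <cstdio>
#include <cstdlib>

struct FakeStatus { int code; };                        // stand-in for miopenStatus_t
static FakeStatus CreateDesc(int** d) { *d = new int(0); return {0}; }
static FakeStatus SetDesc(int* d, int rank) { *d = rank; return {0}; }
static FakeStatus DestroyDesc(int* d) { delete d; return {0}; }
static void Check(FakeStatus s) {
  if (s.code != 0) std::abort();                        // ENFORCE_GPU_SUCCESS analogue
}

class ScopedTensorDesc {
 public:
  ScopedTensorDesc() { Check(CreateDesc(&desc_)); }
  void Set(int rank) { Check(SetDesc(desc_, rank)); }
  ~ScopedTensorDesc() { DestroyDesc(desc_); }           // mirrors release(), never throws
  const int* get() const { return desc_; }

 private:
  int* desc_ = nullptr;
};

int main() {
  ScopedTensorDesc x;
  x.Set(3);                                             // rank-3 descriptor, as in the cache
  std::printf("descriptor rank = %d\n", *x.get());
  return 0;
}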
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mish_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 35e35eb4bcb55..4a3d1f455bd26 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -26,14 +26,12 @@ using Tensor = framework::Tensor; using framework::DataLayout; inline dnnl::memory::dims GetWeightsTz(const Tensor* filter, const int groups) { - auto iohw_weights_tz = framework::vectorize(filter->dims()); - auto weights_tz = iohw_weights_tz; - - // IOHW -> OIHW - weights_tz[0] = iohw_weights_tz[1]; - weights_tz[1] = iohw_weights_tz[0]; + auto weights_tz = framework::vectorize(filter->dims()); int g = std::max(groups, 1); + int g_dim = (g > 1) ? 1 : 0; platform::GetGroupConvWeightsTz(weights_tz, g); + // gIOHW -> gOIHW || IOHW -> OIHW + std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); return weights_tz; } diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 0457aeed616fa..0266edac75d1e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -25,9 +25,9 @@ using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; using Tensor = paddle::framework::Tensor; -using paddle::framework::vectorize; -using paddle::framework::make_ddim; using paddle::framework::GradVarName; +using paddle::framework::make_ddim; +using paddle::framework::vectorize; template class MatMulV2MKLDNNHandler @@ -123,45 +123,58 @@ class MatMulV2MKLDNNHandler } }; -template -class MatMulV2MKLDNNKernel - : public paddle::operators::MatMulGradMKLDNNKernel { - public: - void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } +bool IsOutputFused(const ExecutionContext& ctx) { + auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); + auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); + return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); +} + +float ComputeOutputScale(const ExecutionContext& ctx) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.Attr("force_fp32_output"); + float scale_out = force_fp32_out ? 
1.f : ctx.Attr("Scale_out"); + return scale_out / (scale_x * scale_y); +} - protected: - void ExecuteMatMul(const ExecutionContext& ctx, +template +void ExecuteMatMulV2(const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx, const dnnl::engine onednn_engine, paddle::platform::Place cpu_place, const Tensor* x, std::vector& x_dims, bool trans_x, const Tensor* y, std::vector& y_dims, bool trans_y, Tensor* out, std::vector& out_dims, - int execution_number = 0) const { - MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, - trans_x, y_dims, trans_y, - IsOutputFused(ctx)); + int execution_number = 0) { + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, + IsOutputFused(ctx)); - const auto src_memory_p = handler.AcquireSrcMemory(x); - const auto weights_memory_p = handler.AcquireWeightsMemory(y); - const auto dst_memory_p = handler.AcquireDstMemory(out); + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); - auto matmul_p = handler.AcquireForwardPrimitive(); + auto matmul_p = handler.AcquireForwardPrimitive(); - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; - auto& astream = MKLDNNDeviceContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); + auto& astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); - auto format = paddle::platform::MKLDNNFormatForSize( - out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_layout(paddle::framework::DataLayout::kMKLDNN); - out->set_format(format); - } + auto format = paddle::platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw); + out->set_layout(paddle::framework::DataLayout::kMKLDNN); + out->set_format(format); +} + +template +class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } private: void CalculateMatrixDims(const ExecutionContext& ctx, @@ -207,13 +220,6 @@ class MatMulV2MKLDNNKernel } } - bool IsOutputFused(const ExecutionContext& ctx) const { - auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); - auto& fused_transpose_Out = - ctx.Attr>("fused_transpose_Out"); - return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); - } - void RunKernel(const ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); @@ -237,13 +243,14 @@ class MatMulV2MKLDNNKernel CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, out); - ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_bd_dims, - trans_x, y, y_bd_dims, trans_y, out, out_dims); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, + x_bd_dims, trans_x, y, y_bd_dims, trans_y, out, + out_dims); } }; template -class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { +class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { public: void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } @@ -316,7 +323,7 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { // if no 
broadcasting is needed, we can simply use matmul's grad and avoid // using reduce_sum if (!is_broadcast) { - paddle::operators::MatMulGradMKLDNNKernel::Compute(ctx); + matmul_v1_grad_mkldnn_kernel.Compute(ctx); return; } @@ -342,33 +349,29 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { dy_bd_dims); if (trans_x && trans_y) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, - y_dims, true, dout, dout_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims, + true, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, + 2); } else if (trans_x) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, - y_dims, false, dout, dout_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, - x_dims, false, dout, dout_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims, + false, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims, + false, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2); } else if (trans_y) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, false, y, y_dims, false, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, true, x, x_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, false, &dx_tmp, + dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, false, &dy_tmp, dy_bd_dims, + 2); } else { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, false, y, y_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, - x_dims, true, dout, dout_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, true, &dx_tmp, dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims, + true, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2); } if (x_dims != dx_bd_dims) { @@ -389,8 +392,12 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { dy->set_layout(paddle::framework::DataLayout::kMKLDNN); dy->set_format(y->format()); } + + private: + paddle::operators::MatMulGradMKLDNNKernel matmul_v1_grad_mkldnn_kernel; }; } // anonymous namespace + namespace ops = paddle::operators; REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index d9bd843a9d0cf..e5f70fa10e375 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -227,6 +227,8 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(slice, MKLDNN, paddle::platform::CPUPlace, ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel, ops::SliceMKLDNNKernel); namespace ops = paddle::operators; diff 
--git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu index ee638ede22b64..cec17f1324313 100644 --- a/paddle/fluid/operators/mv_op.cu +++ b/paddle/fluid/operators/mv_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 169af47e95acd..bcbc96ea1b6d1 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -50,7 +50,7 @@ void Communicator::InitAll(const std::vector& gpus) { for (size_t i = 0; i < gpus.size(); ++i) { (*comm_id_map)[gpus[i]] = i; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); inited = true; } diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 9a4a036077f58..f319ce159f6dd 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -74,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -111,7 +111,7 @@ class NCCLReduceKernel : public framework::OpKernel { } VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, reduction_op_, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -136,7 +136,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -145,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " << framework::product(out->dims()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 6c7fba8d4ac78..41c1b4d7a8f81 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -23,9 +23,9 @@ limitations under the License. 
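The NCCL hunks follow the same rule: ncclCommInitAll, ncclAllReduce, ncclReduce and ncclBcast are now wrapped in PADDLE_ENFORCE_GPU_SUCCESS. Below is a minimal, single-rank usage sketch of the same call shape (it assumes NCCL and the CUDA runtime are available, d_send/d_recv are already device pointers, and real error handling is reduced to an exit-on-failure check).

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <nccl.h>

#define CHECK_OK(cond)                              \
  do {                                              \
    if (!(cond)) {                                  \
      std::fprintf(stderr, "failed: %s\n", #cond);  \
      std::exit(1);                                 \
    }                                               \
  } while (0)

// One-rank all-reduce: enough to show the argument order used in nccl_op.cu.cc.
void AllReduceSingleRank(const float* d_send, float* d_recv, size_t count) {
  int dev = 0;
  ncclComm_t comm;
  CHECK_OK(cudaSetDevice(dev) == cudaSuccess);
  CHECK_OK(ncclCommInitAll(&comm, 1, &dev) == ncclSuccess);
  CHECK_OK(ncclAllReduce(d_send, d_recv, count, ncclFloat, ncclSum, comm,
                         /*stream=*/0) == ncclSuccess);
  CHECK_OK(cudaStreamSynchronize(0) == cudaSuccess);
  ncclCommDestroy(comm);
}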
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -44,7 +44,7 @@ const f::DDim kDims = {20, 20}; class NCCLTester : public ::testing::Test { public: void SetUp() override { - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 0) { LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is " << count; diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu index b6e7cd256e18d..03af45634149d 100644 --- a/paddle/fluid/operators/nll_loss_op.cu +++ b/paddle/fluid/operators/nll_loss_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/nll_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 843736833f815..241c634e3fc98 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -26,11 +26,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index bffd1d5305127..3da7a3afcc93d 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 2366f1422244e..22eb6c81845d1 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 5043468d4c5f7..8b939b7c6b3ba 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
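Alongside the macro rename, device-query helpers move as well: the NCCL test above switches from p::GetCUDADeviceCount() to p::GetGPUDeviceCount() out of platform/device/gpu/gpu_info.h. A rough sketch of what such a helper can look like (assumed shape only, not the actual gpu_info implementation), selecting the runtime at build time so callers never see the backend:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

inline int GetGPUDeviceCountSketch() {
  int count = 0;
#ifdef PADDLE_WITH_HIP
  if (hipGetDeviceCount(&count) != hipSuccess) count = 0;
#else
  if (cudaGetDeviceCount(&count) != cudaSuccess) count = 0;
#endif
  return count;  // tests skip (as NCCLTester::SetUp does) when this is <= 0
}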
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index 6c63376b5eb42..1733150f27128 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -50,7 +50,8 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddInput("LearningRate", "(Tensor, default Tensor) " - "Input learning rate"); + "Input learning rate") + .AsDuplicable(); AddInput("MasterParam", "FP32 master weight for AMP.") .AsDispensable() .AsDuplicable(); @@ -68,6 +69,18 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable() .AsDuplicable(); AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum or not.") + .SetDefault(false); + AddAttr>( + "regularization_method", + "(string) regularization_method, right now only " + "support l2decay or none") + .SetDefault({}); + AddAttr>("regularization_coeff", + "(float) regularization_coeff") + .SetDefault({}); AddAttr("multi_precision", "(bool, default false) " "Whether to use multi-precision during weight updating.") diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h index 4dfaa4de3ad44..7560b4fd8e5f9 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.h +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/macros.h" @@ -85,33 +86,43 @@ class MergedMomentumOpKernel : public framework::OpKernel { auto params = ctx.MultiInput("Param"); auto params_out = ctx.MultiOutput("ParamOut"); size_t n = params.size(); - PADDLE_ENFORCE_EQ( - n, params_out.size(), - platform::errors::InvalidArgument( - "Output(ParamOut) number must be equal to Input(Param) number.")); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ( - params[i], params_out[i], - platform::errors::InvalidArgument( - "Input(Param) and Output(ParamOut) must be the same Tensors.")); + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); } auto grads = ctx.MultiInput("Grad"); PADDLE_ENFORCE_EQ( n, grads.size(), platform::errors::InvalidArgument( - "Input(Grad) number must be equal to Input(Param) number.")); + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); auto velocitys = ctx.MultiInput("Velocity"); PADDLE_ENFORCE_EQ(n, velocitys.size(), 
platform::errors::InvalidArgument( - "Input(Velocity) number and Input(Param) number.")); + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), n)); auto velocitys_out = ctx.MultiOutput("VelocityOut"); PADDLE_ENFORCE_EQ( n, velocitys_out.size(), - platform::errors::InvalidArgument("Output(VelocityOut) number must be " - "equal to Input(Param) number.")); + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), n)); for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], platform::errors::InvalidArgument( @@ -126,12 +137,18 @@ class MergedMomentumOpKernel : public framework::OpKernel { if (multi_precision) { PADDLE_ENFORCE_EQ( n, master_params.size(), - platform::errors::InvalidArgument("Input(MasterParam) number must be " - "equal to Input(Param) number.")); - PADDLE_ENFORCE_EQ(n, master_params_out.size(), - platform::errors::InvalidArgument( - "Output(MasterParamOut) number must be equal to " - "Input(MasterParam) number.")); + platform::errors::InvalidArgument( + "The size of Input(MasterParam) must be " + "equal to Input(Param), but got the size of Input(MasterParam) " + "is %d, the size of Input(Param) is %d.", + master_params.size(), n)); + PADDLE_ENFORCE_EQ( + n, master_params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(MasterParamOut) must be equal to " + "Input(MasterParam), but got the size of Output(MasterParamOut) " + "is %d, the size of Input(Param) is %d.", + master_params_out.size(), n)); for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_EQ(master_params[i], master_params_out[i], platform::errors::InvalidArgument( @@ -147,20 +164,61 @@ class MergedMomentumOpKernel : public framework::OpKernel { master_params_out.clear(); } - auto lr = ctx.Input("LearningRate"); auto mu = ctx.Attr("mu"); auto rescale_grad = ctx.Attr("rescale_grad"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + using MPType = typename 
operators::details::MPTypeTrait::Type; auto &dev_ctx = ctx.template device_context(); + if (lrs.size() == 1 && use_nesterov == false && + regularization_methods.size() == 0) { #define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ MergedMomentumKernelParam kernel_params; \ constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ kernel_params.mu = static_cast(mu); \ kernel_params.rescale_grad = static_cast(rescale_grad); \ - kernel_params.lr = lr->data(); \ + kernel_params.lr = lrs[0]->data(); \ for (size_t i = 0; i < kernel_num; ++i) { \ size_t start = i * kMaxMergedNum; \ size_t end = std::min((i + 1) * kMaxMergedNum, n); \ @@ -182,14 +240,78 @@ class MergedMomentumOpKernel : public framework::OpKernel { VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ << kernel_params.param_num; \ } - - if (multi_precision) { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL } else { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); - } + for (size_t idx = 0; idx < n; idx++) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; -#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + MPType regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = + static_cast(regularization_coeffs[idx]); + } + auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; + + const MPType *master_in_data = + multi_precision ? master_params[idx]->data() : nullptr; + MPType *master_out_data = + multi_precision ? 
master_params_out[idx]->data() : nullptr; + if (platform::is_cpu_place(ctx.GetPlace())) { + CPUDenseMomentumFunctor functor; + functor(params[idx], grads[idx], velocitys[idx], lr_temp, mu, + use_nesterov, regularization_flag, regularization_coeff, + params_out[idx], velocitys_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; + } else if (platform::is_gpu_place(ctx.GetPlace())) { + platform::ForRange for_range( + static_cast(ctx.device_context()), + params[idx]->numel()); +#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ + DenseMomentumFunctor functor( \ + params[idx]->data(), grads[idx]->data(), \ + velocitys[idx]->data(), lr_temp->data(), master_in_data, \ + mu, rescale_grad, params[idx]->numel(), regularization_coeff, \ + params_out[idx]->data(), velocitys_out[idx]->data(), \ + master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + UseNesterov, RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(UseNesterov, + RegularizationType::kNONE); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; + } + } else { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + NoNesterov, RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(NoNesterov, + RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; + } + } + } + } + VLOG(10) + << "Launch MergedMomentum kernel with multi_lr and regularization."; + } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a5d9ad271f23a..3582e939f30ac 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1516320ec573..23e37ea27b54f 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -445,12 +445,12 @@ class SparseMomentumOpKernel : public framework::OpKernel { for_range_index(range_functor); size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, nullptr, nullptr, nullptr, nullptr, static_cast(num_index)))); auto d_temp_storage = memory::Alloc(ctx.GetPlace(), temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, index->data(), sorted_index_ptr, sort_value_ptr, grad_index_ptr, diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index a77d0a5650ef3..a854fa6091ab4 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
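The merged_momentum changes above add per-parameter learning rates (LearningRate is now duplicable), use_nesterov, and optional per-parameter L2 regularization, and the kernel now dispatches: the fused multi-tensor launch is kept for the original case (one learning rate, no Nesterov, no regularization), while everything else falls back to a per-parameter momentum update. Below is a simplified CPU-only sketch of that dispatch, with plain vectors instead of framework::Tensor and with AMP master weights and rescale_grad omitted.

#include <cstddef>
#include <string>
#include <vector>

struct MergedMomentumInputs {
  std::vector<std::vector<float>*> params, grads, velocitys;
  std::vector<const float*> lrs;                     // size 1 or params.size()
  std::vector<std::string> regularization_methods;   // empty or one per param
  std::vector<float> regularization_coeffs;          // empty or one per param
  bool use_nesterov = false;
  float mu = 0.9f;
};

// Standard momentum update with optional L2 decay and Nesterov correction.
void MomentumUpdate(std::vector<float>& p, const std::vector<float>& g,
                    std::vector<float>& v, float lr, float mu, bool nesterov,
                    float l2) {
  for (std::size_t i = 0; i < p.size(); ++i) {
    float grad = g[i] + l2 * p[i];
    v[i] = mu * v[i] + grad;
    p[i] -= nesterov ? lr * (grad + mu * v[i]) : lr * v[i];
  }
}

void MergedMomentumSketch(MergedMomentumInputs& in) {
  const std::size_t n = in.params.size();
  const bool fused_case = in.lrs.size() == 1 && !in.use_nesterov &&
                          in.regularization_methods.empty();
  if (fused_case) {
    // The real kernel packs many parameters into one fused GPU launch here.
    for (std::size_t i = 0; i < n; ++i)
      MomentumUpdate(*in.params[i], *in.grads[i], *in.velocitys[i],
                     *in.lrs[0], in.mu, /*nesterov=*/false, /*l2=*/0.f);
    return;
  }
  // General path added in this diff: per-parameter learning rate and
  // regularization, handled one tensor at a time.
  for (std::size_t i = 0; i < n; ++i) {
    float lr = *(in.lrs.size() > 1 ? in.lrs[i] : in.lrs[0]);
    bool l2_decay = !in.regularization_methods.empty() &&
                    in.regularization_methods[i] == "l2_decay";
    float coeff =
        in.regularization_coeffs.empty() ? 0.f : in.regularization_coeffs[i];
    MomentumUpdate(*in.params[i], *in.grads[i], *in.velocitys[i], lr, in.mu,
                   in.use_nesterov, l2_decay ? coeff : 0.f);
  }
}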
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index f243a78e5578b..1567251236550 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 8fcd40a9a2df4..bbe3174012947 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -16,14 +16,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pool_op.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif #ifdef PADDLE_WITH_HIP #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/miopen_helper.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -204,17 +201,17 @@ class PoolCUDNNOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP char *pool_workspace; size_t pool_worksize = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenPoolingGetWorkSpaceSizeV2( cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data)); @@ -468,17 +465,17 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP char *pool_workspace; size_t pool_worksize = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenPoolingGetWorkSpaceSizeV2( cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenPoolingBackward( + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( 
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 9d8f086ce0f18..fa98e76e39338 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,12 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index ce3f5969cef49..06cc9ed7a96e5 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" #include "paddle/fluid/operators/prelu_op.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index f9e2b78d5d31a..38f8d6542ac32 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index f69edfc1fcfec..5a0d1a700417c 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index 5bde6bc2e5cbb..26a02ea622479 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -13,8 +13,8 @@ // limitations under the License. 
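The pooling hunks above keep MIOpen's explicit workspace protocol: query the size with miopenPoolingGetWorkSpaceSizeV2, hipMalloc that many bytes, hand the buffer to miopenPoolingForward/Backward, then hipFree it. A generic stand-alone sketch of that query-allocate-run-free sequence, with mock functions in place of the MIOpen and HIP calls:

#include <cstdio>
#include <cstdlib>

// Stand-ins for miopenPoolingGetWorkSpaceSizeV2 and miopenPoolingForward.
static size_t PoolingWorkspaceSize(int n, int c, int h, int w) {
  return static_cast<size_t>(n) * c * h * w * sizeof(float);
}
static void PoolingForward(const float* x, float* y, void* ws, size_t ws_bytes) {
  (void)x; (void)y; (void)ws; (void)ws_bytes;  // the real call consumes the scratch buffer
}

void RunPooling(const float* x, float* y, int n, int c, int h, int w) {
  size_t ws_bytes = PoolingWorkspaceSize(n, c, h, w);  // 1. query
  void* ws = std::malloc(ws_bytes);                    // 2. allocate (hipMalloc in the op)
  PoolingForward(x, y, ws, ws_bytes);                  // 3. run with scratch space
  std::free(ws);                                       // 4. release (hipFree)
  std::printf("used %zu workspace bytes\n", ws_bytes);
}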
#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu index 8bba9db5426b7..96a1b1c08b79c 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_sparse_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/pull_box_sparse_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 992df172ace0c..3eb5f72b5b117 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -167,7 +167,7 @@ void QrGPUKernel::BatchedGeqrf( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -178,7 +178,7 @@ void QrGPUKernel::BatchedGeqrf( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgeqrf( handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? @@ -201,7 +201,7 @@ void QrGPUKernel::BatchedGeqrf( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -212,7 +212,7 @@ void QrGPUKernel::BatchedGeqrf( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgeqrf( handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? 
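The qr_op hunks only change the enforcement macro, but the surrounding cuSOLVER pattern is worth isolating: one cusolverDnSgeqrf_bufferSize query sizes a workspace that a loop of cusolverDnSgeqrf calls then reuses across the batch. A compile-only sketch of the same flow (assumes the CUDA toolkit with cuSOLVER; per-call status checks reduced to early returns):

#include <cuda_runtime.h>
#include <cusolverDn.h>

bool BatchedGeqrfSketch(cusolverDnHandle_t handle, int batch, int m, int n,
                        float* a, int lda, int a_stride, float* tau,
                        int tau_stride, int* dev_info) {
  int lwork = 0;
  if (cusolverDnSgeqrf_bufferSize(handle, m, n, a, lda, &lwork) !=
      CUSOLVER_STATUS_SUCCESS)
    return false;

  float* workspace = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&workspace),
                 sizeof(float) * lwork) != cudaSuccess)
    return false;

  bool ok = true;
  for (int i = 0; i < batch && ok; ++i) {
    // Each matrix in the batch reuses the same workspace buffer.
    ok = cusolverDnSgeqrf(handle, m, n, a + i * a_stride, lda,
                          tau + i * tau_stride, workspace, lwork,
                          dev_info) == CUSOLVER_STATUS_SUCCESS;
  }
  cudaFree(workspace);
  return ok;
}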
@@ -235,7 +235,7 @@ void QrGPUKernel::BatchedOrgqr( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -246,7 +246,7 @@ void QrGPUKernel::BatchedOrgqr( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute orggr - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSorgqr( handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? @@ -270,7 +270,7 @@ void QrGPUKernel::BatchedOrgqr( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -281,7 +281,7 @@ void QrGPUKernel::BatchedOrgqr( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute orggr - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDorgqr( handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu index 6250d68730e13..23a0f2d0a24e3 100644 --- a/paddle/fluid/operators/range_op.cu +++ b/paddle/fluid/operators/range_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/range_op.h" #include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index aaa4eec7c1bf3..23b4475e1f7c1 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -17,8 +17,8 @@ limitations under the License. 
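BatchedOrgqr is the second half of the QR computation: after geqrf has left the Householder reflectors in A and tau, cusolverDnSorgqr expands them into an explicit Q. A companion compile-only sketch, with the same conventions as the geqrf sketch above:

#include <cuda_runtime.h>
#include <cusolverDn.h>

bool OrgqrSketch(cusolverDnHandle_t handle, int m, int n, int k, float* a,
                 int lda, const float* tau, int* dev_info) {
  int lwork = 0;
  if (cusolverDnSorgqr_bufferSize(handle, m, n, k, a, lda, tau, &lwork) !=
      CUSOLVER_STATUS_SUCCESS)
    return false;
  float* workspace = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&workspace),
                 sizeof(float) * lwork) != cudaSuccess)
    return false;
  bool ok = cusolverDnSorgqr(handle, m, n, k, a, lda, tau, workspace, lwork,
                             dev_info) == CUSOLVER_STATUS_SUCCESS;
  cudaFree(workspace);
  return ok;
}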
*/ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 58af6309e3d28..6c28daa7eac72 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -161,14 +161,14 @@ void BufferedReader::ReadAsync(size_t i) { platform::SetDeviceId( BOOST_GET_CONST(platform::CUDAPlace, place_).device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #endif @@ -199,19 +199,12 @@ void BufferedReader::ReadAsync(size_t i) { memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, size, stream_.get()); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_.get())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); -#endif + + platform::GpuStreamSync(stream_.get()); } cuda[i].set_lod(cpu[i].lod()); } -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_.get())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); -#endif + platform::GpuStreamSync(stream_.get()); } } #endif diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index c433cac56a431..3d42486c6df88 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -22,8 +22,8 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_resource_pool.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_info.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 90adea60927c0..dc79666b72fa6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -64,6 +64,17 @@ struct CustomSum { } }; +template +struct CustomSub { + using Transformer = kps::InverseFunctor; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b + a; + } +}; + template struct CustomMean { using Transformer = kps::DivideFunctor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 6b3b484320018..9c348477963b4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ 
b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,7 +36,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/fast_divmod.h" // Reduce split or not, Whether to use ReduceHigherDim @@ -464,9 +465,9 @@ struct ReduceConfig { reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); } int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_mp = platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; int num_threads = block_dim->x * block_dim->y; int max_num_blocks = max_threads / num_threads; @@ -506,9 +507,9 @@ struct ReduceConfig { left_num = last_dim_num; grid_dim->z = grid_z; int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_mp = platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; // init int num_block = (max_threads / left_num); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 5148e3b0940c9..c12db1293856b 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -383,13 +383,13 @@ class ReshapeKernel { // 3. out tensor is view of input // We can't MakePtenDenseTensor for case 2, so we solve this case by // creating a temporary tensor here: - const auto alloc = std::make_shared( - ctx.GetPlace()); pten::DenseTensorMeta meta{pten::TransToPtenDataType(in->type()), in->dims(), pten::TransToPtenDataLayout(in->layout())}; - auto pt_out_tmp = - std::make_shared(alloc, std::move(meta)); + auto pt_out_tmp = std::make_shared( + pten::make_intrusive( + ctx.GetPlace()), + std::move(meta)); pten::DenseTensor *pt_out = nullptr; if (in == out) { pt_out = pt_x.get(); @@ -484,7 +484,8 @@ class ReshapeKernel { // non-inplace need move all result from pt_out to out, inplace need set // result dims. if (in != out) { - paddle::experimental::MovesStorage(pt_out, static_cast(out)); + paddle::experimental::MovesSharedStorage(pt_out, + static_cast(out)); } else { out->Resize(pt_out->dims()); } diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 07329a9175e52..de4847ddc4590 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -16,12 +16,7 @@ limitations under the License. 
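In reduce_op.cu.h the renamed helpers GetGPUMultiProcessors and GetGPUMaxThreadsPerMultiProcessor still feed the same occupancy bound: the grid is capped by the number of threads the device can keep resident divided by the block size. A CUDA sketch of that calculation using the raw device attributes (assumed here to return the same quantities as the platform helpers):

#include <cuda_runtime.h>

inline int MaxActiveBlocks(int device_id, int threads_per_block) {
  int sm_count = 0, max_threads_per_sm = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_id);
  cudaDeviceGetAttribute(&max_threads_per_sm,
                         cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
  int max_threads = sm_count * max_threads_per_sm;
  return max_threads / threads_per_block;  // cap used when picking grid.x / grid.y
}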
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -97,12 +92,12 @@ class RNNDescriptors { bool is_initialized = dropout_state->IsInitialized(); if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -114,19 +109,19 @@ class RNNDescriptors { // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, mode_, miopenRNNwithBias, miopenRNNdefault, cudnn_type)); #elif CUDNN_VERSION >= 6000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, @@ -135,7 +130,7 @@ class RNNDescriptors { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif @@ -143,10 +138,10 @@ class RNNDescriptors { // ------------------- cudnn weights_size --------------------- size_t weights_size_; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #endif PADDLE_ENFORCE_EQ( @@ -160,18 +155,18 @@ class RNNDescriptors { weight_desc_.descriptor(layout, dim_w); // ------------------- cudnn workspace, reserve size --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); @@ -557,7 +552,7 @@ class RNNCudnnKernel : public framework::OpKernel { // for train // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -565,7 +560,7 @@ class RNNCudnnKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -577,15 +572,13 @@ class RNNCudnnKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. 
- PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, - rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, - rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, - rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_data_.data(), workspace_size, - reserve_data, reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.init_h_desc(), + init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), + w_data, rnn.y_seq_desc(), out_data, rnn.last_h_desc(), last_h_data, + rnn.last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data_.data(), + workspace_size, reserve_data, reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( "The padded input is supported by " @@ -606,14 +599,14 @@ class RNNCudnnKernel : public framework::OpKernel { // for inference // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, workspace_data->data(), workspace_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, @@ -624,7 +617,7 @@ class RNNCudnnKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, @@ -831,7 +824,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { if (!has_seq_length) { if (in_grad) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -842,7 +835,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { const_cast(reserve_data), reserve_size)); #else // This interface is used when the input/output is unpadded. 
- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -855,7 +848,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { } if (!weight_grad_list.empty()) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), rnn.weight_desc(), weight_grad_data, @@ -865,7 +858,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { tensor_to_permuted_weight(place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), @@ -878,7 +871,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { // for train // This interface is used when the input/output is padded. if (in_grad) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), @@ -891,13 +884,12 @@ class RNNGradCudnnKernel : public framework::OpKernel { } if (!weight_grad_list.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNBackwardWeightsEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), - rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), - out->data(), workspace_data_.data(), workspace_size, - rnn.weight_desc(), weight_grad_data, - const_cast(reserve_data), reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), + rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), + weight_grad_data, const_cast(reserve_data), + reserve_size)); } #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 111828005222b..a08339d776ff1 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 562ff8d576b7d..0a4a076c6caae 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
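Editor's note on the hunks above: nearly every change is the mechanical rename of PADDLE_ENFORCE_CUDA_SUCCESS to PADDLE_ENFORCE_GPU_SUCCESS, so one enforcement macro covers both the cuDNN and MIOpen code paths. The sketch below only illustrates the dispatch idea behind such a backend-neutral check for runtime errors; it is not Paddle's actual macro, which also understands cuDNN/NCCL/cuFFT status codes and produces far richer error messages.

// Illustrative sketch only: a GPU-success check that compiles against either
// CUDA or ROCm, assuming the usual PADDLE_WITH_HIP build-flag convention.
#include <cstdio>
#include <cstdlib>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
static constexpr gpuError_t kGpuSuccess = hipSuccess;
inline const char* GpuErrorString(gpuError_t e) { return hipGetErrorString(e); }
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
static constexpr gpuError_t kGpuSuccess = cudaSuccess;
inline const char* GpuErrorString(gpuError_t e) { return cudaGetErrorString(e); }
#endif

#define ENFORCE_GPU_SUCCESS_SKETCH(expr)                              \
  do {                                                                \
    gpuError_t err__ = (expr);                                        \
    if (err__ != kGpuSuccess) {                                       \
      std::fprintf(stderr, "GPU call `%s` failed: %s\n", #expr,       \
                   GpuErrorString(err__));                            \
      std::abort();                                                   \
    }                                                                 \
  } while (0)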
*/ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index d70bd58887f84..57986d262820d 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/roll_op.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index a712878854298..586cf3239b575 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index c24f924313fb9..4e9c84ef4c950 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -75,14 +75,14 @@ class ScaleOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { if (ctx.InputVar("X")->IsType() || ctx.InputVar("X")->IsType()) { + std::string scale_attr; if (ctx.HasInput("ScaleTensor")) { - return framework::KernelSignature("scale.host", {"X", "ScaleTensor"}, - {"bias", "bias_after_scale"}, - {"Out"}); + scale_attr = "ScaleTensor"; } else { - return framework::KernelSignature( - "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); + scale_attr = "scale"; } + return framework::KernelSignature( + "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); } // TODO(chenweihang): support other cases after selected rows added return framework::KernelSignature("scale.unregistered", {}, {}, {}); diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index e3791351cefb3..6c7a0a8886ef0 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index de368e6e80219..8d92ea4166513 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -48,18 +48,49 @@ class ScatterNPUKernel : public framework::OpKernel { index = &tmp_tensor; } - auto stream = - ctx.template device_context() - .stream(); + const auto& dev_ctx = + ctx.template device_context(); + auto op_func_update = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + auto op_func_add = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; if (overwrite) { - const auto& runner_update = NpuOpRunner( - "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); - runner_update.Run(stream); + if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter( + {*x, *index, *updates}, {*out}, {}, dev_ctx, op_func_update, + {framework::proto::VarType::INT32, framework::proto::VarType::INT32, + framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner_update = NpuOpRunner( + "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); + runner_update.Run(dev_ctx.stream()); + } } else { - const auto& runner_add = - NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); - runner_add.Run(stream); + if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter( + {*x, *index, *updates}, {*out}, {}, dev_ctx, op_func_add, + {framework::proto::VarType::INT32, framework::proto::VarType::INT32, + framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner_add = + NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); + runner_add.Run(dev_ctx.stream()); + } } } }; @@ -70,6 +101,10 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( scatter, ops::ScatterNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ScatterNPUKernel, +#endif + ops::ScatterNPUKernel, ops::ScatterNPUKernel); #endif diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 379a07a26dd5c..4e20844dc3275 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 5f9635c8ae111..307bf4010f7ff 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -72,11 +72,11 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { const IndexT* segment_ids = segment->data(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), hipMemcpyDeviceToHost)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), cudaMemcpyDeviceToHost)); #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 6d8f60ce932ab..8092a40d19b19 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -15,7 +15,7 @@ #include #include #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bacaaeadbf576..bb928cf401c33 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index c8b6156881c96..1c4265a71d4ea 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index d4f4051c3a460..f63fa5be7f496 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu index f2800c60c3304..115b3f47d664b 100644 --- a/paddle/fluid/operators/shard_index_op.cu +++ b/paddle/fluid/operators/shard_index_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/share_buffer_op.cc b/paddle/fluid/operators/share_buffer_op.cc index a161b9272b7b2..f6a6c9695b2ad 100644 --- a/paddle/fluid/operators/share_buffer_op.cc +++ b/paddle/fluid/operators/share_buffer_op.cc @@ -49,7 +49,8 @@ class ShareBufferOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), The output tensors which are the same as X. It is " "used to build the graph dependency") .AsDuplicable(); - AddAttr>("share_dims", "Whether to share dims") + AddAttr>("share_dims_and_dtype", + "Whether to share dims and data type") .SetDefault(std::vector()); AddComment( R"DOC(Operator used to perform inplace memory reuse. It should be not exposed to Python APIs.)DOC"); diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h index 5138ad9d54b79..1d0abf14f577e 100644 --- a/paddle/fluid/operators/share_buffer_op.h +++ b/paddle/fluid/operators/share_buffer_op.h @@ -29,12 +29,13 @@ class ShareBufferOpKernel : public framework::OpKernel { size_t n = inputs.size(); PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied( "Variable number not match.")); - const auto &share_dims = ctx.Attr>("share_dims"); - if (!share_dims.empty()) { - PADDLE_ENFORCE_EQ( - n, share_dims.size(), - platform::errors::PermissionDenied( - "Attribute share_dims number not match input variable number.")); + const auto &share_dims_and_dtype = + ctx.Attr>("share_dims_and_dtype"); + if (!share_dims_and_dtype.empty()) { + PADDLE_ENFORCE_EQ(n, share_dims_and_dtype.size(), + platform::errors::PermissionDenied( + "Attribute share_dims_and_dtype number not match " + "input variable number.")); } const std::vector *input_args = nullptr, @@ -50,8 +51,9 @@ class ShareBufferOpKernel : public framework::OpKernel { outputs[i]->ShareBufferWith(*inputs[i]); VLOG(10) << "Share tensor buffer " << (*input_args)[i] << " -> " << (*output_args)[i]; - if (!share_dims.empty() && share_dims[i]) { + if (!share_dims_and_dtype.empty() && share_dims_and_dtype[i]) { outputs[i]->Resize(inputs[i]->dims()); + outputs[i]->ShareDataTypeWith(*inputs[i]); } } } diff --git a/paddle/fluid/operators/share_buffer_op_test.cc b/paddle/fluid/operators/share_buffer_op_test.cc new file mode 100644 index 0000000000000..60220981cab1d --- /dev/null +++ b/paddle/fluid/operators/share_buffer_op_test.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" + +USE_OP(share_buffer); + +namespace paddle { +namespace framework { + +TEST(test_share_buffer_op, test_share_buffer_op) { + std::vector inputs = {"X1", "X2"}; + std::vector outputs = {"Y1", "Y2"}; + std::vector dims = {{2, 3, 4}, {5, 6}}; + std::vector share_dims_and_dtype = {false, true}; + + size_t n = inputs.size(); + EXPECT_EQ(n, outputs.size()); + EXPECT_EQ(n, dims.size()); + EXPECT_EQ(n, share_dims_and_dtype.size()); + + OpDesc desc; + desc.SetType("share_buffer"); + desc.SetInput("X", inputs); + desc.SetOutput("Out", outputs); + desc.SetOutput("XOut", inputs); + desc.SetAttr("share_dims_and_dtype", share_dims_and_dtype); + + auto op = OpRegistry::CreateOp(desc); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::Place place = platform::CUDAPlace(0); +#else + platform::Place place = platform::CPUPlace(); +#endif + + Scope scope; + for (size_t i = 0; i < n; ++i) { + auto *in_tensor = scope.Var(inputs[i])->GetMutable(); + in_tensor->Resize(dims[i]); + in_tensor->mutable_data(place); + scope.Var(outputs[i])->GetMutable(); + } + op->Run(scope, place); + platform::DeviceContextPool::Instance().Get(place)->Wait(); + + for (size_t i = 0; i < n; ++i) { + const auto &in_tensor = scope.Var(inputs[i])->Get(); + const auto &out_tensor = scope.Var(outputs[i])->Get(); + EXPECT_TRUE(out_tensor.IsSharedBufferWith(in_tensor)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index dbc3e1a7ebe26..582d1ea0f26af 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -10,8 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. 
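The new share_buffer_op_test.cc above exercises the renamed share_dims_and_dtype attribute end to end. Conceptually the kernel only aliases each output onto its input's allocation and, where the per-pair flag is set, also copies shape and data-type metadata. A toy sketch of that aliasing idea, not Paddle's Tensor API:

#include <cstdint>
#include <memory>
#include <vector>

// Toy tensor: a shared buffer plus independent metadata. Hypothetical type,
// used only to illustrate "share buffer, optionally share dims and dtype".
struct ToyTensor {
  std::shared_ptr<std::vector<char>> buffer;  // underlying allocation
  std::vector<int64_t> dims;
  int dtype = 0;  // stands in for the framework's dtype enum
};

void ShareBuffer(const ToyTensor& in, ToyTensor* out, bool share_dims_and_dtype) {
  out->buffer = in.buffer;    // alias the same memory, no copy
  if (share_dims_and_dtype) {
    out->dims = in.dims;      // propagate shape
    out->dtype = in.dtype;    // propagate data type
  }
}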
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 8611249a29f63..cc012230c1062 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -21,7 +21,7 @@ namespace cub = hipcub; #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index a5513ba648776..4965e5e156c34 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -244,7 +244,7 @@ class SliceOpMaker : public framework::OpProtoAndCheckerMaker { "mkldnn_data_type", "(string, default \"float32\"). Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "bfloat16"}) + .InEnum({"float32", "int8", "bfloat16"}) .AsExtra(); AddComment(R"DOC( Slice Operator. diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/fluid/operators/softmax_cudnn_op.cu.h index 68b694a59f47d..533488896dfcd 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.h @@ -18,12 +18,8 @@ limitations under the License. */ #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -453,7 +449,7 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, const int N = SizeToAxis(axis, dims); const int D = SizeOutAxis(axis, dims); - constexpr int max_dim = 320; + constexpr int max_dim = 512; constexpr int warps_per_block = 4; if (D == 1 && dim <= max_dim && sizeof(T) <= 4) { @@ -503,12 +499,12 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_LOG, mode)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_ACCURATE, mode)); @@ -517,12 +513,12 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); @@ -544,7 +540,7 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, const int N = SizeToAxis(axis, dims); const int D = SizeOutAxis(axis, dims); - constexpr int max_dim = 320; + constexpr int max_dim = 512; constexpr int warps_per_block = 4; if (D == 1 && dim <= max_dim && sizeof(T) <= 4) { @@ -591,12 +587,12 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_LOG, mode)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_ACCURATE, mode)); @@ -605,12 +601,12 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3b1753b49b11d..cb97a0bb27cb5 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -18,13 +18,7 @@ limitations under the License. */ #include #include -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 6a9dca9fe2a6a..520c95b6f3484 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -20,12 +20,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif namespace paddle { namespace operators { @@ -453,14 +449,14 @@ static void SoftmaxWithCrossEntropyHardLabel( #ifdef PADDLE_WITH_HIP auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), descp, logits_data, platform::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, mode)); #else auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), descp, logits_data, platform::CudnnDataType::kZero(), descp, softmax_data)); diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index 924ec7cd52d50..39639768241d4 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -66,7 +66,7 @@ class CuFFTHandle { public: CuFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); } CuFFTHandle(const CuFFTHandle& other) = delete; @@ -79,7 +79,7 @@ class CuFFTHandle { const ::cufftHandle& get() const { return handle_; } ~CuFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); } }; @@ -136,12 +136,12 @@ class FFTConfig { } // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( plan(), /* autoAllocate */ 0)); size_t ws_size_t; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, @@ -176,7 +176,7 @@ class HIPFFTHandle { public: HIPFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); } HIPFFTHandle(const HIPFFTHandle& other) = delete; @@ -189,7 +189,7 @@ class HIPFFTHandle { const ::hipfftHandle& get() const { return handle_; } ~HIPFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); } }; using plan_size_type = int; @@ -248,12 +248,12 @@ class FFTConfig { }(); // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( plan(), /* autoAllocate */ 0)); size_t ws_size_t; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index e97af7cea7e08..4ad99724fd622 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -96,7 +96,7 @@ static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, void* out_data, bool forward) { auto& plan = config.plan(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( plan, in_data, out_data, forward ? 
CUFFT_FORWARD : CUFFT_INVERSE)); } @@ -167,20 +167,20 @@ static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, if (value_type == framework::proto::VarType::FP32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( plan, static_cast(in_data), static_cast(out_data), forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( plan, static_cast(in_data), static_cast(out_data))); return; @@ -189,20 +189,20 @@ static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, } else if (value_type == framework::proto::VarType::FP64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( plan, static_cast(in_data), static_cast(out_data), forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( plan, static_cast(in_data), static_cast(out_data))); return; @@ -332,11 +332,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, } // prepare cufft for execution - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cufftSetStream(config->plan(), ctx.stream())); framework::Tensor workspace_tensor; workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( config->plan(), workspace_tensor.data())); // execute transform plan exec_cufft_plan(ctx, *config, &collapsed_input, @@ -355,11 +355,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, config = &(plan_cache.lookup(key)); // prepare cufft for execution - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); framework::Tensor workspace_tensor; workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( config->plan(), workspace_tensor.data())); // execute transform plan exec_hipfft_plan(ctx, *config, &collapsed_input, diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 9e5e45f4d22d9..5b3f03445d352 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -16,7 +16,7 @@ #include #include #include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace plat = paddle::platform; namespace ops = paddle::operators; diff 
--git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index ade7496d64622..0a7ed093ad0b8 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -91,9 +91,9 @@ void SvdGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); @@ -102,7 +102,7 @@ void SvdGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -116,7 +116,7 @@ void SvdGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -134,9 +134,9 @@ void SvdGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); @@ -145,7 +145,7 @@ void SvdGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -159,7 +159,7 @@ void SvdGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index 69617b7e208a8..201de5ac1a428 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -21,19 +21,18 @@ limitations under the License. 
*/ #include #ifdef __NVCC__ #include "cub/cub.cuh" -#include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef __HIPCC__ #include namespace cub = hipcub; -#include "paddle/fluid/platform/miopen_helper.h" #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace operators { @@ -192,7 +191,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, if (comm) { int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); } @@ -466,7 +465,7 @@ void SyncBatchNormGradFunctor( if (comm) { int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); } diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index cb1ff5335cdf0..eb5a78f9dc0ec 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -10,8 +10,8 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 07749f90ebaa2..05ae5c9188ceb 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -24,7 +24,7 @@ limitations under the License. */ #endif #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/float16.h" #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 784d97b543fbd..6c637effee2cb 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/trunc_op.cu b/paddle/fluid/operators/trunc_op.cu index a284e0ea6e393..68d8c608f6338 100644 --- a/paddle/fluid/operators/trunc_op.cu +++ b/paddle/fluid/operators/trunc_op.cu @@ -12,8 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/trunc_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index f38f5d9f72357..1426c799007a0 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -16,12 +16,7 @@ limitations under the License. */ #include -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index b1cd172923ee6..feb8e83864e84 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -24,7 +24,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/where_index_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 721c6e5390e85..54b0d5b69086c 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace platform = paddle::platform; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 280674f9ab147..d8d41e9d9185a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -47,18 +47,11 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) - nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) - nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) - nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -IF(WITH_ROCM) - hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) -ENDIF() - cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) @@ -78,6 +71,12 @@ IF(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) ENDIF() +IF(WITH_IPU) + set(IPU_CTX_DEPS ipu_backend) +ELSE() + set(IPU_CTX_DEPS) +ENDIF(WITH_IPU) + IF(WITH_ASCEND_CL) set(NPU_CTX_DEPS npu_stream npu_info) ENDIF() @@ -116,7 +115,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) @@ -125,8 +124,7 @@ if(WITH_ASCEND_CL) endif() if(WITH_GPU OR WITH_ROCM) - cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) - target_link_libraries(device_context cuda_resource_pool) + target_link_libraries(device_context gpu_resource_pool) endif() if(WITH_ASCEND_CL) @@ -147,8 +145,6 @@ if(WITH_GPU) nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) - nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) endif() @@ -158,8 +154,6 @@ if(WITH_ROCM) hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) - hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor) hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) endif() @@ -172,11 +166,9 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri 
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) - nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) - hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 03359d932b5ab..25f8f3ed9f3d8 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include -#include "paddle/fluid/platform/cuda_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" namespace paddle { namespace platform { @@ -96,7 +96,7 @@ NCCLComm* NCCLCommContext::CreateComm(ncclUniqueId* nccl_id, int nranks, ncclComm_t comm = nullptr; SetDeviceId(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); auto* comm_wrapper = AssignNCCLComm(comm, nranks, rank, dev_id, ring_id); @@ -121,7 +121,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, const int kDevices = dev_ids.size(); ncclComm_t comms[kDevices]; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( comms, dev_ids.size(), dev_ids.data())); PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, @@ -153,18 +153,18 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( << ", rind_id: " << ring_id; ncclComm_t comms[kDevices]; { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipSetDevice(i)); + PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(i)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); VLOG(1) << "ncclCommInitRank: " << i; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h index 35de34086c57d..e50b74133847c 100644 --- a/paddle/fluid/platform/complex.h +++ b/paddle/fluid/platform/complex.h @@ -401,6 +401,16 @@ HOSTDEVICE inline T abs(const complex& a) { #endif } +template +HOSTDEVICE inline T arg(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::arg(thrust::complex(a)); +#else + return std::arg(std::complex(a)); +#endif +} + template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h index a85ebf4b81366..40204c0ed83f9 100644 --- 
a/paddle/fluid/platform/cuda_device_guard.h +++ b/paddle/fluid/platform/cuda_device_guard.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index 6586146c5aefb..7a9e1a3a1419c 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -17,7 +17,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #endif namespace paddle { @@ -60,6 +60,23 @@ inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) { callback(); } +template +inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) { + static_assert(std::is_trivial::value, "T must be trivial type"); + static_assert(!std::is_same::value, "T cannot be void"); +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(IsCUDAGraphCapturing())) { + size_t nbytes = size * sizeof(T); + void *new_host_mem = new uint8_t[nbytes]; + std::memcpy(new_host_mem, host_mem, nbytes); + AddResetCallbackIfCapturingCUDAGraph( + [new_host_mem] { delete[] reinterpret_cast(new_host_mem); }); + return reinterpret_cast(new_host_mem); + } +#endif + return host_mem; +} + class SkipCUDAGraphCaptureGuard { DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard); diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 515453afb63be..0cd07dec20e3e 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -2,8 +2,16 @@ IF(WITH_XPU) add_subdirectory(xpu) ENDIF() +IF(WITH_GPU OR WITH_ROCM) + add_subdirectory(gpu) +ENDIF() # NPU IF(WITH_ASCEND OR WITH_ASCEND_CL) add_subdirectory(npu) ENDIF() + +# IPU +IF(WITH_IPU) + add_subdirectory(ipu) +ENDIF() diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt new file mode 100644 index 0000000000000..5cf2258204fda --- /dev/null +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -0,0 +1,15 @@ +IF(WITH_GPU) + add_subdirectory(cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) + + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) + nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +ELSEIF(WITH_ROCM) + add_subdirectory(rocm) + hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) + + hip_test(cuda_helper_test SRCS cuda_helper_test.cu) + hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +ENDIF() + +cc_library(gpu_resource_pool SRCS gpu_resource_pool.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..5df1de1b00fac --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -0,0 +1,5 @@ +nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda) +nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) +nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) + +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) diff --git 
a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h similarity index 67% rename from paddle/fluid/platform/cuda_device_function.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 352143302388a..e7d807573957f 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -22,16 +22,11 @@ limitations under the License. */ namespace paddle { namespace platform { -#ifdef PADDLE_WITH_HIP -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) -#else #define FULL_WARP_MASK 0xFFFFFFFF #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif inline static int RoundToPowerOfTwo(int dim) { -#ifdef PADDLE_WITH_CUDA if (dim > 512) { return 1024; } else if (dim > 256) { @@ -45,17 +40,6 @@ inline static int RoundToPowerOfTwo(int dim) { } else { return 32; } -#else // HIP results in error or nan if > 256 - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -#endif } #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ @@ -76,71 +60,15 @@ template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) - return __shfl_down(val, delta, width); -#else return __shfl_down_sync(mask, val, static_cast(delta), width); -#endif } template __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) - return __shfl_xor(val, width); -#else return __shfl_xor_sync(mask, val, width); -#endif -} - -#if defined(PADDLE_WITH_HIP) -template <> -__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, - float16 val, int delta, - int width) { - return float16(__shfl_down(static_cast(val), - static_cast(delta), width)); } -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, - int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, int width) { - return float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} -#else template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, @@ -197,16 +125,11 @@ __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( 
__shfl_xor_sync(mask, static_cast(val.imag), width)); return paddle::platform::complex(real, imag); } -#endif template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { -#if defined(PADDLE_WITH_HIP) - return __shfl(val, src_line, width); -#else return __shfl_sync(mask, val, src_line, width); -#endif } template @@ -216,17 +139,13 @@ HOSTDEVICE T Infinity() { template __device__ T reduceSum(T val, int tid, int len) { -// NOTE(zcd): The warp size should be taken from the -// parameters of the GPU but not specified as 32 simply. -// To make the reduceSum more efficiently, -// I use Warp-Level Parallelism and assume the Warp size -// is 32 which may be different for different GPU, -// but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. const int warpSize = 32; -#endif __shared__ T shm[warpSize]; unsigned mask = 0u; CREATE_SHFL_MASK(mask, tid < len); diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc similarity index 90% rename from paddle/fluid/platform/cuda_graph.cc rename to paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index 6f3d452ef5c50..3970acf82d3ea 100644 --- a/paddle/fluid/platform/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" namespace paddle { namespace platform { @@ -23,11 +23,11 @@ void CUDAGraph::Reset() { if (is_reset_) return; #if CUDA_VERSION >= 10010 for (auto graph : graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); } graphs_.clear(); for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphExecDestroy(exec_graph)); } exec_graphs_.clear(); #endif @@ -46,7 +46,7 @@ void CUDAGraph::Replay() { errors::PermissionDenied( "Cannot replay the CUDA Graph after reset is called.")); for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); } #endif } @@ -58,7 +58,7 @@ void CUDAGraph::BeginSegmentCapture() { IsCapturing(), true, errors::PermissionDenied("BeginSegmentCapture should be called when CUDA " "Graph is capturing.")); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamBeginCapture( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture( capturing_graph_->stream_, capturing_graph_->capture_mode_)); PADDLE_ENFORCE_EQ(IsValidCapturing(), true, platform::errors::PermissionDenied( @@ -92,19 +92,19 @@ void CUDAGraph::EndSegmentCapture() { PADDLE_ENFORCE_EQ(IsCapturing(), true, errors::PermissionDenied("No CUDA Graph is capturing.")); cudaGraph_t graph; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamEndCapture(capturing_graph_->stream_, &graph)); auto num_nodes = static_cast(-1); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); if (num_nodes == 0) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_ << ", segment id " << capturing_graph_->graphs_.size(); return; } cudaGraphExec_t exec_graph; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0)); VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_ << ", segment id " << capturing_graph_->graphs_.size(); @@ -123,7 +123,7 @@ bool CUDAGraph::IsValidCapturing() { if (!IsCapturing()) return false; cudaStreamCaptureStatus status; CUDAGraphID id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); return status == cudaStreamCaptureStatusActive; #else @@ -154,7 +154,7 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname, ConcatPath(dirname, "segment_" + std::to_string(i) + ".dot"); VLOG(10) << "Save the " << i << "-th segment of graph " << id_ << " to " << filename; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphDebugDotPrint(graphs_[i], filename.c_str(), flags)); } #else diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h similarity index 96% rename from paddle/fluid/platform/cuda_graph.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index f70a66f76242f..0856e0fad1900 100644 --- a/paddle/fluid/platform/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -21,7 +21,7 @@ #include #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -129,7 +129,7 @@ class CUDAGraphCaptureModeGuard { explicit CUDAGraphCaptureModeGuard( cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, // the variable "mode" would be set to the old capturing mode. 
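      // Saving the returned old mode here lets the destructor below exchange
      // it back with a second cudaThreadExchangeStreamCaptureMode call, so the
      // thread's original capture mode is restored when the guard leaves scope.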
old_mode_ = mode; @@ -138,7 +138,7 @@ class CUDAGraphCaptureModeGuard { ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaThreadExchangeStreamCaptureMode(&old_mode_)); } } diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h similarity index 78% rename from paddle/fluid/platform/cuda_helper.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 202be920c5595..3199af9c97520 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -16,12 +16,7 @@ #include // NOLINT -#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/rocblas.h" -#endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -72,28 +67,13 @@ namespace platform { * */ -#ifdef __HIPCC__ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) -#else #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ for (index_type i = __index__; __index__ < (num); \ __index__ += blockDim.x * gridDim.x, i = __index__) -#endif - -#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) class CublasHandleHolder { public: -#ifdef PADDLE_WITH_HIP - explicit CublasHandleHolder(hipStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_)); - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream)); - } -#else CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); @@ -109,20 +89,11 @@ class CublasHandleHolder { } #endif // CUDA_VERSION >= 9000 } -#endif -#ifdef PADDLE_WITH_HIP - const rocblas_handle& GetCublasHandle() const { return handle_; } -#else const cublasHandle_t& GetCublasHandle() const { return handle_; } -#endif ~CublasHandleHolder() PADDLE_MAY_THROW { -#ifdef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_)); -#else PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); -#endif } template @@ -134,11 +105,7 @@ class CublasHandleHolder { private: DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); -#ifdef PADDLE_WITH_HIP - rocblas_handle handle_; -#else cublasHandle_t handle_; -#endif mutable std::mutex mtx_; }; diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc new file mode 100644 index 0000000000000..6109ed6554318 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace paddle { +namespace platform { +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + return dynload::cudnnGetVersion(); +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + cudaError_t status = cudaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int major, minor; + auto major_error_code = + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 10 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { + int device = GetCurrentDeviceId(); + int driver_version = GetGPUComputeCapability(device); + return driver_version >= 70; +} + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDevice(&device_id)); + return device_id; +} + +dim3 GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + dim3 ret; + int size; + auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret.x = size; + + auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret.y = size; + + auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret.z = size; + return ret; +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. 
Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return cudaGetLastError(); } +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc similarity index 85% rename from paddle/fluid/platform/cuda_profiler.cc rename to paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 998dd80dc5e7d..42351fe097a9d 100644 --- a/paddle/fluid/platform/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/cuda_profiler.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" namespace paddle { namespace platform { @@ -25,13 +25,13 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, "`csv`, but received `%s`.", output_mode)); cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); } -void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); } +void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } -void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); } +void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); } #ifndef _WIN32 void CudaNvtxRangePush(std::string name) { diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h similarity index 100% rename from paddle/fluid/platform/cuda_profiler.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h similarity index 84% rename from paddle/fluid/platform/cudnn_desc.h rename to paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index 318c85ee484be..7bff2c69381e6 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -23,7 +23,7 @@ #include #include -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -99,7 +99,7 @@ class ActivationDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyActivationDescriptor(t)); t = nullptr; } @@ -107,13 +107,13 @@ class ActivationDescriptor { }; ActivationDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(cudnnActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetActivationDescriptor( desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast(coef))); } @@ -130,14 +130,14 @@ class TensorDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t)); t = nullptr; } } }; TensorDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -153,7 +153,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(), dims_with_group.data(), strides.data())); } @@ -166,7 +166,7 @@ class TensorDescriptor { } else { transformed_dims = dims; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( desc_.get(), format, dtype, transformed_dims.size(), transformed_dims.data())); } @@ -187,14 +187,14 @@ class FilterDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t)); t = nullptr; } } }; 
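  // Like the other descriptor wrappers in this header, the raw cuDNN handle
  // is created once in the constructor below, owned by a std::unique_ptr that
  // uses the Deleter above, and therefore released exactly once when the
  // wrapper goes out of scope.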
FilterDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -211,7 +211,7 @@ class FilterDescriptor { if (groups > 1) { transformed_dims[1] = transformed_dims[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_.get(), dtype, format, transformed_dims.size(), transformed_dims.data())); } @@ -233,7 +233,7 @@ class ConvolutionDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyConvolutionDescriptor(t)); t = nullptr; } @@ -241,7 +241,7 @@ class ConvolutionDescriptor { }; ConvolutionDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } @@ -255,28 +255,26 @@ class ConvolutionDescriptor { cudnnDataType_t compute_type = (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; T* desc = desc_.get(); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); #if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionGroupCount(desc, groups)); #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( desc, CUDNN_DEFAULT_MATH)); if (dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(desc, - CUDNN_TENSOR_OP_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_TENSOR_OP_MATH)); #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(desc, - CUDNN_TENSOR_OP_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8,1,0) } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h similarity index 88% rename from paddle/fluid/platform/cudnn_helper.h rename to paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index 65dd69a37d37f..2bcdbaa201889 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -191,10 +191,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); } inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -216,20 +216,20 @@ class ScopedTensorDescriptor { if (dims.size() == 4) { if (format == CUDNN_TENSOR_NCHW) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensor4dDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensor4dDescriptor( desc_, format, type, dims[0], dims[3], dims[1], dims[2])); } } else if (dims.size() == 5) { if (format == CUDNN_TENSOR_NCHW) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( desc_, format, type, dims.size(), dims.data())); } } @@ -247,7 +247,7 @@ class ScopedTensorDescriptor { inline cudnnTensorDescriptor_t descriptor(const cudnnDataType_t cudnn_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, cudnn_type, dim.size(), dim.data(), stride.data())); return desc_; } @@ -269,11 +269,11 @@ class ScopedTensorDescriptor { class ScopedRNNTensorDescriptor { public: ScopedRNNTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); } ~ScopedRNNTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); } inline cudnnRNNDataDescriptor_t descriptor( @@ -288,7 +288,7 @@ class ScopedRNNTensorDescriptor { layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetRNNDataDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetRNNDataDescriptor( desc_, cudnn_type, layout, max_seq_length, batch_size, input_size, seq_length.data(), static_cast(&padding_fill))); @@ -314,10 +314,10 @@ class ScopedRNNTensorDescriptor { class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); } inline cudnnDropoutDescriptor_t descriptor(const cudnnHandle_t& handle, @@ -327,19 +327,19 @@ class ScopedDropoutDescriptor { framework::Tensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetDropoutDescriptor( desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); } else { auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } return desc_; @@ -354,10 +354,10 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); } inline cudnnRNNDescriptor_t desc() { return desc_; } @@ -370,10 +370,10 @@ class ScopedRNNDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); } inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -389,7 +389,7 @@ class ScopedFilterDescriptor { kernel_with_group[0] /= groups; // NOTE: input filter(C) of the filter is already asserted to be C/groups. } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_, type, format, kernel_with_group.size(), kernel_with_group.data())); return desc_; @@ -413,11 +413,11 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyConvolutionDescriptor(desc_)); } @@ -438,7 +438,7 @@ class ScopedConvolutionDescriptor { cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc_, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); return desc_; @@ -459,10 +459,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); } inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -480,7 +480,7 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. 
But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. kernel.size(), kernel.data(), pads.data(), strides.data())); @@ -495,18 +495,18 @@ class ScopedPoolingDescriptor { class ScopedSpatialTransformerDescriptor { public: ScopedSpatialTransformerDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); } ~ScopedSpatialTransformerDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); } template inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, const int dimA[]) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, nbDims, dimA)); return desc_; } @@ -519,11 +519,11 @@ class ScopedSpatialTransformerDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyActivationDescriptor(desc_)); } @@ -561,7 +561,7 @@ class ScopedActivationDescriptor { "Unrecognized CUDNN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetActivationDescriptor( desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling)); return desc_; } @@ -587,15 +587,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); } template inline cudnnCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType::type)); return desc_; } diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc similarity index 98% rename from paddle/fluid/platform/cudnn_helper_test.cc rename to paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc index 98ec2be87755c..851d0d18c604c 100644 --- a/paddle/fluid/platform/cudnn_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu similarity index 98% rename from paddle/fluid/platform/cuda_helper_test.cu rename to paddle/fluid/platform/device/gpu/cuda_helper_test.cu index fd46aa2393403..ab8bb2cad8c51 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -21,11 +21,11 @@ #include #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc similarity index 90% rename from paddle/fluid/platform/cudnn_desc_test.cc rename to paddle/fluid/platform/device/gpu/cudnn_desc_test.cc index db5362f5cb1f5..8ea30027e8ade 100644 --- a/paddle/fluid/platform/cudnn_desc_test.cc +++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc @@ -12,11 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/device/gpu/gpu_device_function.h b/paddle/fluid/platform/device/gpu/gpu_device_function.h new file mode 100644 index 0000000000000..a8daa5e87fdc3 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_device_function.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h" +#endif + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h new file mode 100644 index 0000000000000..3f9bc5e6de80b --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/miopen_desc.h" +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#else // CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#endif + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h new file mode 100644 index 0000000000000..6077a7b625d25 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -0,0 +1,26 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" +#endif + +#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc new file mode 100644 index 0000000000000..e68277cc37b38 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -0,0 +1,356 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/miopen.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#endif +#include "paddle/fluid/memory/malloc.h" +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif +#endif +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_string(selected_gpus); +DECLARE_uint64(gpu_memory_limit_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +USE_GPU_MEM_STAT; +namespace paddle { +namespace platform { +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void GpuMemoryUsage(size_t *available, size_t *total) { + size_t actual_available, actual_total; + RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + platform::GetCurrentDeviceId()); +} + +size_t GpuAvailableMemToAlloc() { + size_t total = 0; + size_t available = 0; + GpuMemoryUsage(&available, &total); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = GpuMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20) + << "M, " << (available_to_alloc >> 20) << "M available to allocate"; + return available_to_alloc; +} + +size_t GpuMaxAllocSize() { + return std::max(GpuInitAllocSize(), GpuReallocSize()); +} + +static size_t GpuAllocSize(bool realloc) { + size_t available_to_alloc = GpuAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, 0, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? 
flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + +size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); } + +size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); } + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t max_chunk_size = GpuMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +static void RaiseNonOutOfMemoryError(gpuError_t *status) { + if (*status == gpuErrorOutOfMemory) { + *status = gpuSuccess; + } + PADDLE_ENFORCE_GPU_SUCCESS(*status); + + *status = platform::GpuGetLastError(); + if (*status == gpuErrorOutOfMemory) { + *status = gpuSuccess; + } + PADDLE_ENFORCE_GPU_SUCCESS(*status); +} + +class RecordedGpuMallocHelper { + private: + explicit RecordedGpuMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); + + public: + static RecordedGpuMallocHelper *Instance(int dev_id) { + std::call_once(once_flag_, [] { + int dev_cnt = GetGPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + instances_.emplace_back( + new RecordedGpuMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, instances_.size(), + platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", + dev_id, instances_.size())); + return instances_[dev_id].get(); + } + + /** + * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation + * or cudaSuccess would be returned, and the cudaGetLastError() flag + * would be clear. + */ + gpuError_t Malloc(void **ptr, size_t size) { + LockGuardPtr lock(mtx_); + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { + return gpuErrorOutOfMemory; + } + + CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto result = hipMalloc(ptr, size); +#else + CUDAGraphCaptureModeGuard capture_mode_guard; + auto result = cudaMalloc(ptr, size); +#endif + if (result == gpuSuccess) { + cur_size_.fetch_add(size); + STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + return gpuSuccess; + } else { + RaiseNonOutOfMemoryError(&result); + // Non out of memory error would be raised inside + // RaiseNonOutOfMemoryError. Therefore, we can + // return cudaErrorMemoryAllocation directly here. + return gpuErrorOutOfMemory; + } + } + + /** + * Free gpu memory. Usually, free is not allowed to raise error. + * If it does raise error, the process should be crashed. + */ + void Free(void *ptr, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. 
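    // On the normal free path below, cur_size_ is decremented and the
    // per-device memory stat updated, keeping RecordedSize() consistent with
    // what Malloc() previously added.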
+ CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto err = hipFree(ptr); + if (err != hipErrorDeinitialized) { +#else + auto err = cudaFree(ptr); + if (err != cudaErrorCudartUnloading) { +#endif + PADDLE_ENFORCE_GPU_SUCCESS(err); + cur_size_.fetch_sub(size); + STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + } else { + platform::GpuGetLastError(); // clear the error flag when + // cudaErrorCudartUnloading / + // hipErrorDeinitialized + } + } + + bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total) { + { + CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto result = hipMemGetInfo(actual_avail, actual_total); +#else + auto result = cudaMemGetInfo(actual_avail, actual_total); +#endif + if (result != gpuSuccess) { + *actual_avail = 0; + } + RaiseNonOutOfMemoryError(&result); + } + + if (NeedRecord()) { + std::lock_guard guard(*mtx_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); + *total = std::min(*actual_total, limit_size_); + return *total < *actual_total; + } else { + *avail = *actual_avail; + *total = *actual_total; + return false; + } + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { return cur_size_.load(); } + + uint64_t LimitSize() const { return limit_size_; } + +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 + CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags) { // NOLINT + auto result = + paddle::platform::dynload::cuMemCreate(handle, size, prop, flags); + if (result == CUDA_SUCCESS) { + cur_size_.fetch_add(size); + } + return result; + } + + CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) { + auto result = paddle::platform::dynload::cuMemRelease(handle); + if (result == CUDA_SUCCESS) { + cur_size_.fetch_sub(size); + } + return result; + } + +#endif +#endif + + private: + const int dev_id_; + const uint64_t limit_size_; + std::atomic cur_size_{0}; + + mutable std::unique_ptr mtx_; + + static std::once_flag once_flag_; + static std::vector> instances_; +}; // NOLINT + +std::once_flag RecordedGpuMallocHelper::once_flag_; +std::vector> + RecordedGpuMallocHelper::instances_; + +gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(ptr, size); +} + +void RecordedGpuFree(void *p, size_t size, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->Free(p, size); +} + +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags, int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, + prop, flags); +} + +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); +} +#endif +#endif + +bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( + avail, total, actual_avail, actual_total); +} + +uint64_t RecordedGpuMallocSize(int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->RecordedSize(); +} + +bool IsGpuMallocRecorded(int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->NeedRecord(); +} + +void 
EmptyCache(void) { + std::vector devices = GetSelectedDevices(); + for (auto device : devices) { + memory::Release(CUDAPlace(device)); + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h similarity index 70% rename from paddle/fluid/platform/gpu_info.h rename to paddle/fluid/platform/device/gpu/gpu_info.h index 93e787fcf36f5..18e6ac83295f8 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,49 +11,42 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_CUDA -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// Note: this header for simplify HIP and CUDA type string + #include #include #include -#include "paddle/fluid/platform/type_defs.h" + +#include "paddle/fluid/platform/device/gpu/gpu_types.h" namespace paddle { namespace platform { -//! Get the version of cudnn -int CudnnVersion(); +//! Get the version of dnn +int DnnVersion(); //! Get the total number of GPU devices in system. -int GetCUDADeviceCount(); +int GetGPUDeviceCount(); //! Get the compute capability of the ith GPU (format: major * 10 + minor) -int GetCUDAComputeCapability(int i); +int GetGPUComputeCapability(int id); //! Get the runtime version of the ith GPU -int GetCUDARuntimeVersion(int id); +int GetGPURuntimeVersion(int id); //! Get the driver version of the ith GPU -int GetCUDADriverVersion(int id); +int GetGPUDriverVersion(int id); //! Wheter the current device support TensorCore bool TensorCoreAvailable(); //! Get the MultiProcessors of the ith GPU. -int GetCUDAMultiProcessors(int i); +int GetGPUMultiProcessors(int id); //! Get the MaxThreads of each MultiProcessor of the ith GPU. -int GetCUDAMaxThreadsPerMultiProcessor(int i); +int GetGPUMaxThreadsPerMultiProcessor(int id); //! Get the MaxThreads of each block of the ith GPU. -int GetCUDAMaxThreadsPerBlock(int i); +int GetGPUMaxThreadsPerBlock(int id); //! Get the current GPU device id in system. int GetCurrentDeviceId(); @@ -97,19 +87,11 @@ size_t GpuMaxChunkSize(); //! Copy memory from address src to dst asynchronously. void GpuMemcpyAsync(void *dst, const void *src, size_t count, -#ifdef PADDLE_WITH_HIP - enum hipMemcpyKind kind, hipStream_t stream); -#else - enum cudaMemcpyKind kind, cudaStream_t stream); -#endif + gpuMemcpyKind kind, gpuStream_t stream); //! Copy memory from address src to dst synchronously. void GpuMemcpySync(void *dst, const void *src, size_t count, -#ifdef PADDLE_WITH_HIP - enum hipMemcpyKind kind); -#else - enum cudaMemcpyKind kind); -#endif + gpuMemcpyKind kind); //! Copy memory from one device to another device asynchronously. void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, @@ -125,34 +107,40 @@ void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); //! Blocks until stream has completed all operations. 
void GpuStreamSync(gpuStream_t stream); +void GpuDestroyStream(gpuStream_t stream); + +// ! Blocks until device has completed all operations. +void GpuDeviceync(); + //! CudaMalloc with recorded info -gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id); +gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); //! CudaFree with recorded info -void RecordedCudaFree(void *p, size_t size, int dev_id); +void RecordedGpuFree(void *p, size_t size, int dev_id); + +gpuError_t GpuGetLastError(); #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - //! cuMemCreate with recorded info -CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id); // NOLINT +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags, int dev_id); // NOLINT //! cuMemRelease with recorded info -CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size, - int dev_id); +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, + int dev_id); #endif #endif //! Get available and total gpu memory with considering limitation -bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id); +bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id); //! Get recorded cudaMalloc size. If record is disabled, return 0. -uint64_t RecordedCudaMallocSize(int dev_id); +uint64_t RecordedGpuMallocSize(int dev_id); -bool IsCudaMallocRecorded(int dev_id); +bool IsGpuMallocRecorded(int dev_id); //! Empty idle cached memory held by the allocator. void EmptyCache(void); diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h similarity index 98% rename from paddle/fluid/platform/gpu_launch_config.h rename to paddle/fluid/platform/device/gpu/gpu_launch_config.h index 399f1dbaa03e1..55f4c8eb4cd55 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -28,6 +28,7 @@ #include #include #include +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h similarity index 100% rename from paddle/fluid/platform/cuda_primitives.h rename to paddle/fluid/platform/device/gpu/gpu_primitives.h diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc similarity index 84% rename from paddle/fluid/platform/cuda_resource_pool.cc rename to paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 70d2ec5505798..2c55eb972b765 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -13,24 +13,24 @@ // limitations under the License. 
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_resource_pool.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { CudaStreamResourcePool::CudaStreamResourcePool() { - int dev_cnt = platform::GetCUDADeviceCount(); + int dev_cnt = platform::GetGPUDeviceCount(); pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); gpuStream_t stream; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); #endif return stream; @@ -39,9 +39,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { auto deleter = [dev_idx](gpuStream_t stream) { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif }; @@ -69,17 +69,17 @@ std::shared_ptr CudaStreamResourcePool::New(int dev_idx) { } CudaEventResourcePool::CudaEventResourcePool() { - int dev_cnt = platform::GetCUDADeviceCount(); + int dev_cnt = platform::GetGPUDeviceCount(); pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); gpuEvent_t event; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); #endif return event; @@ -88,9 +88,9 @@ CudaEventResourcePool::CudaEventResourcePool() { auto deleter = [dev_idx](gpuEvent_t event) { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif }; diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h similarity index 100% rename from paddle/fluid/platform/cuda_resource_pool.h rename to paddle/fluid/platform/device/gpu/gpu_resource_pool.h diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h new file mode 100644 index 0000000000000..d7362fe9cbd81 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -0,0 +1,94 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
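CudaStreamResourcePool and CudaEventResourcePool above are built from one creator lambda and one deleter lambda per device, each of which switches to that device before creating or destroying the handle. Below is a self-contained sketch of that creator/deleter shape with plain C++ stand-ins (an int plays the role of the stream handle); the real pool additionally caches and reuses released objects.

```cpp
// Creator/deleter pattern sketch; the handle type and device count are mocks.
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

template <typename T>
class ResourcePool {
 public:
  ResourcePool(std::function<T()> creator, std::function<void(T)> deleter)
      : creator_(std::move(creator)), deleter_(std::move(deleter)) {}

  // Hand out a shared_ptr whose custom deleter releases the resource.
  std::shared_ptr<T> New() {
    T* raw = new T(creator_());
    auto del = deleter_;
    return std::shared_ptr<T>(raw, [del](T* p) { del(*p); delete p; });
  }

 private:
  std::function<T()> creator_;
  std::function<void(T)> deleter_;
};

int main() {
  int dev_cnt = 2;  // GetGPUDeviceCount() in the real code
  std::vector<ResourcePool<int>> pools;
  for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
    pools.emplace_back(
        [dev_idx] { std::cout << "create handle on dev " << dev_idx << "\n"; return dev_idx * 100; },
        [dev_idx](int) { std::cout << "destroy handle on dev " << dev_idx << "\n"; });
  }
  auto s = pools[1].New();  // "stream" bound to device 1
  std::cout << "handle value: " << *s << "\n";
}  // the deleter for device 1 runs here
```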
+ +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/dynload/rocblas.h" + +#else +#include +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#endif + +namespace paddle { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else // CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); + +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, + miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); + +using CUDAGraphID = unsigned long long; // NOLINT + +#undef DECLARE_TYPE_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = ROCM_CV; +#else // CDUA + +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = CUDA_CV; +#endif + +DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + +#undef DECLARE_CONSTANT_FOR_GPU +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h similarity index 99% rename from paddle/fluid/platform/nccl_helper.h rename to paddle/fluid/platform/device/gpu/nccl_helper.h index e297e7203c698..f26116749077e 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -70,11 +70,11 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + 
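The new gpu_types.h above centralizes the CUDA-versus-ROCm type selection in the DECLARE_TYPE_FOR_GPU macro so the rest of the code base can be written once against gpuStream_t, dnnHandle_t and friends. The sketch below reproduces only that selection pattern with toy stand-in types; it is not the Paddle header itself.

```cpp
// Backend-neutral type aliasing sketch; the two vendor stream types are mocks.
#include <iostream>

struct cudaStreamImpl {};  using cudaStream_t = cudaStreamImpl*;  // stand-in for the CUDA SDK type
struct hipStreamImpl {};   using hipStream_t = hipStreamImpl*;    // stand-in for the HIP SDK type

#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
  using GPU_TYPE = ROCM_TYPE;
#else  // CUDA build
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
  using GPU_TYPE = CUDA_TYPE;
#endif

DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);

#undef DECLARE_TYPE_FOR_GPU

// Everything below is backend-neutral: it only ever names gpuStream_t.
void Launch(gpuStream_t stream) {
  std::cout << "launch on stream " << static_cast<const void*>(stream) << "\n";
}

int main() {
  gpuStream_t s = nullptr;
  Launch(s);
}
```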
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt new file mode 100644 index 0000000000000..86b9ecd5f5445 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -0,0 +1,3 @@ +hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda) + +hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h similarity index 88% rename from paddle/fluid/platform/miopen_desc.h rename to paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index c82e61ceb122c..d2389ba409e5e 100644 --- a/paddle/fluid/platform/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -23,8 +23,8 @@ #include #include +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace framework { @@ -88,7 +88,7 @@ class ActivationDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyActivationDescriptor(t)); t = nullptr; } @@ -96,13 +96,13 @@ class ActivationDescriptor { }; ActivationDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetActivationDescriptor( desc_.get(), mode, static_cast(coef), 0.0, 0.0)); } @@ -119,15 +119,14 @@ class TensorDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } }; TensorDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::miopenCreateTensorDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -144,7 +143,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -166,7 +165,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -183,15 +182,14 @@ class FilterDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } }; FilterDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::miopenCreateTensorDescriptor(&raw_ptr)); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -212,7 +210,7 @@ class FilterDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -229,7 +227,7 @@ class ConvolutionDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyConvolutionDescriptor(t)); t = nullptr; } @@ -237,7 +235,7 @@ class ConvolutionDescriptor { }; ConvolutionDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } @@ -247,12 +245,12 @@ class ConvolutionDescriptor { void set(miopenDataType_t dtype, const std::vector& pads, const std::vector& strides, const std::vector& dilations, bool allow_tf32, const int groups = 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( (miopenConvolutionDescriptor_t)desc_.get(), static_cast(pads.size()), const_cast(pads.data()), const_cast(strides.data()), const_cast(dilations.data()), miopenConvolution)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenSetConvolutionGroupCount( (miopenConvolutionDescriptor_t)desc_.get(), groups)); } diff --git a/paddle/fluid/platform/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h similarity index 89% rename from paddle/fluid/platform/miopen_helper.h rename to paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 46c7da8397041..bd8d05f8124a1 100644 --- a/paddle/fluid/platform/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -18,6 +18,7 @@ limitations under the License. 
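The descriptor wrappers in miopen_desc.h above all follow the same RAII idiom: a C-style create/destroy pair is held in a std::unique_ptr with a custom Deleter, so each MIOpen handle is released exactly once even when an enforce macro throws. A self-contained sketch of the idiom, with a fake C API standing in for the MIOpen calls:

```cpp
// unique_ptr + custom Deleter around a C-style create/destroy handle API.
#include <cstdio>
#include <memory>

struct fakeDescriptor { int id; };
int fakeCreateDescriptor(fakeDescriptor** out) { *out = new fakeDescriptor{42}; return 0; }
int fakeDestroyDescriptor(fakeDescriptor* d) { std::puts("descriptor destroyed"); delete d; return 0; }

class DescriptorHolder {
 public:
  struct Deleter {
    void operator()(fakeDescriptor* t) const {
      if (t != nullptr) fakeDestroyDescriptor(t);  // PADDLE_ENFORCE_GPU_SUCCESS(...) in the real code
    }
  };

  DescriptorHolder() {
    fakeDescriptor* raw_ptr = nullptr;
    fakeCreateDescriptor(&raw_ptr);  // also checked with PADDLE_ENFORCE_GPU_SUCCESS in the real code
    desc_.reset(raw_ptr);
  }

  fakeDescriptor* desc() { return desc_.get(); }

 private:
  std::unique_ptr<fakeDescriptor, Deleter> desc_;
};

int main() {
  DescriptorHolder holder;
  std::printf("descriptor id: %d\n", holder.desc()->id);
}  // "descriptor destroyed" prints here, when holder leaves scope
```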
*/ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -36,13 +37,6 @@ DECLARE_bool(cudnn_deterministic); namespace paddle { namespace platform { - -// MIOPEN only support NCHW, just for compatibility with CUDNN API -typedef enum { - MIOPEN_TENSOR_NCHW = 0, - MIOPEN_TENSOR_NHWC = 1, -} miopenTensorFormat_t; - inline const char* miopenGetErrorString(miopenStatus_t status) { switch (status) { case miopenStatusSuccess: @@ -188,10 +182,10 @@ inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -216,12 +210,12 @@ class ScopedTensorDescriptor { platform::errors::InvalidArgument( "format should ONLY be NCHW in MIOPEN.")); if (dims.size() == 4) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); } else if (dims.size() == 5) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); @@ -240,7 +234,7 @@ class ScopedTensorDescriptor { inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, miopen_type, dim.size(), const_cast(dim.data()), const_cast(stride.data()))); return desc_; @@ -262,10 +256,10 @@ class ScopedTensorDescriptor { class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); } inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, @@ -275,20 +269,20 @@ class ScopedDropoutDescriptor { framework::Tensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetDropoutDescriptor( desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, 
state_size, seed, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } else { auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } @@ -304,10 +298,10 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); } inline miopenRNNDescriptor_t desc() { return desc_; } @@ -320,10 +314,10 @@ class ScopedRNNDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -344,7 +338,7 @@ class ScopedFilterDescriptor { for (int k = kernel_with_group.size() - 2; k >= 0; k--) { stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, kernel_with_group.size(), const_cast(kernel_with_group.data()), const_cast(stride_dim.data()))); @@ -369,11 +363,11 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyConvolutionDescriptor(desc_)); } @@ -391,7 +385,7 @@ class ScopedConvolutionDescriptor { "The size of pads and dilations should be equal. But received size " "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( desc_, pads.size(), const_cast(pads.data()), const_cast(strides.data()), const_cast(dilations.data()), miopenConvolution)); @@ -413,10 +407,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); } inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -434,7 +428,7 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. 
But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetNdPoolingDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetNdPoolingDescriptor( desc_, GetPoolingMode(mode), kernel.size(), const_cast(kernel.data()), const_cast(pads.data()), const_cast(strides.data()))); @@ -449,11 +443,11 @@ class ScopedPoolingDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyActivationDescriptor(desc_)); } @@ -489,7 +483,7 @@ class ScopedActivationDescriptor { "Unrecognized MIOPEN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetActivationDescriptor( desc_, mode, relu_ceiling, 0.0, 0.0)); return desc_; } @@ -514,15 +508,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); } template inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetCTCLossDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetCTCLossDescriptor( desc_, CudnnDataType::type, 0, false)); return desc_; } diff --git a/paddle/fluid/platform/miopen_helper_test.cc b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc similarity index 98% rename from paddle/fluid/platform/miopen_helper_test.cc rename to paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc index e201f4893f577..13cf52dc2c6a3 100644 --- a/paddle/fluid/platform/miopen_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h new file mode 100644 index 0000000000000..2263383f8fabb --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. 
+#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace platform { + +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) + +inline static int RoundToPowerOfTwo(int dim) { + // HIP results in error or nan if > 256 + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, + int delta, + int width = warpSize) { + return __shfl_down(val, delta, width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, + int width = warpSize) { + return __shfl_xor(val, width); +} + +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, + float16 val, int width) { + return float16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template +__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, + int width = 32) { + return __shfl(val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. 
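RoundToPowerOfTwo above (the ROCm variant) rounds a requested dimension up to one of {32, 64, 128, 256}, with the 256 cap noted in its comment. A quick host-side check of that mapping, duplicated here purely for illustration:

```cpp
// Host-side check of the ROCm RoundToPowerOfTwo mapping shown above.
#include <cstdio>

int RoundToPowerOfTwo(int dim) {
  if (dim > 128) return 256;  // HIP cap
  if (dim > 64) return 128;
  if (dim > 32) return 64;
  return 32;
}

int main() {
  const int samples[] = {1, 32, 33, 100, 128, 129, 500};
  for (int d : samples) {
    std::printf("dim=%3d -> %d\n", d, RoundToPowerOfTwo(d));
  }
  // Prints 32, 32, 64, 128, 128, 256, 256 respectively.
}
```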
+ const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h new file mode 100644 index 0000000000000..a0f3fb0f73ba5 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
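The arithmetic behind the overflow warning above, done on the host in 64-bit where it is well defined: the blockIdx.x and blockDim.x values are the ones quoted in the comment, while the gridDim.x value is an assumed figure of similar magnitude.

```cpp
// Why the grid-stride index must be 64-bit: after one stride the index can
// exceed INT_MAX. gridDim.x below is an assumption chosen for illustration.
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
  const std::int64_t block_idx = 2172938, block_dim = 512, grid_dim = 2200000;
  const std::int64_t first = block_idx * block_dim;        // initial __index__
  const std::int64_t next = first + block_dim * grid_dim;  // after one grid-stride step
  std::printf("first index:      %lld\n", static_cast<long long>(first));
  std::printf("after one stride: %lld\n", static_cast<long long>(next));
  std::printf("INT_MAX:          %d\n", INT_MAX);
  std::printf("int32 would overflow after the first stride: %s\n",
              next > INT_MAX ? "yes" : "no");
}
```

Keeping __index__ as int64_t, as the CUDA_KERNEL_LOOP_TYPE macro below does, keeps both the comparison against num and the increment valid past INT_MAX.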
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + +class CublasHandleHolder { + public: + explicit CublasHandleHolder(hipStream_t stream) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream)); + } + + const rocblas_handle& GetCublasHandle() const { return handle_; } + + ~CublasHandleHolder() PADDLE_MAY_THROW { + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_)); + } + + template + inline void Call(Callback&& callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + rocblas_handle handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc b/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc new file mode 100644 index 0000000000000..06dba8ce423ef --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace paddle { +namespace platform { +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + size_t version_major, version_minor, version_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &version_major, &version_minor, &version_patch)); + return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + hipError_t status = hipDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "HIP_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int major, minor; + auto major_error_code = hipDeviceGetAttribute( + &major, hipDeviceAttributeComputeCapabilityMajor, id); + auto minor_error_code = hipDeviceGetAttribute( + &minor, hipDeviceAttributeComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( + &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDevice(&device_id)); + return device_id; +} + +dim3 GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + dim3 ret; + int size; + auto error_code_x = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret.x = size; + + auto error_code_y = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret.y = size; + + auto error_code_z = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret.z = size; + return ret; +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return hipGetLastError(); } +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt new file mode 100644 index 0000000000000..25629ba74d915 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -0,0 +1,12 @@ +# IPU +IF(WITH_IPU) + cc_library(ipu_device SRCS device.cc DEPS enforce popart) + cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) + cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) + cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce) + cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto) + cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils) + cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper) + cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper) + cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend) +ENDIF() diff --git a/paddle/fluid/platform/device/ipu/common.h b/paddle/fluid/platform/device/ipu/common.h new file mode 100644 index 0000000000000..7d62f10abd201 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/common.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
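GetDeviceProperties() in rocm_info.cc above fills its per-device cache lazily: one global std::once_flag sizes the vectors, and a per-device once_flag guards the single hipGetDeviceProperties call for each id. A minimal stand-alone sketch of that two-level call_once layering; FakeDeviceProp and GetDeviceCount are stand-ins for the real driver calls.

```cpp
// Lazy, thread-safe per-device property cache using std::call_once twice.
#include <cstdio>
#include <memory>
#include <mutex>
#include <vector>

struct FakeDeviceProp { int major = 9, minor = 0; };  // stand-in for hipDeviceProp_t

static std::once_flag g_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_init_flags;
static std::vector<FakeDeviceProp> g_props;

int GetDeviceCount() { return 2; }  // stand-in for GetGPUDeviceCount()

const FakeDeviceProp& GetDeviceProperties(int id) {
  std::call_once(g_size_init_flag, [] {
    const int n = GetDeviceCount();
    g_init_flags.resize(n);
    g_props.resize(n);
    for (int i = 0; i < n; ++i) g_init_flags[i] = std::make_unique<std::once_flag>();
  });
  std::call_once(*g_init_flags[id], [&] {
    std::printf("querying driver for device %d (happens once)\n", id);
    g_props[id] = FakeDeviceProp{};  // hipGetDeviceProperties(...) in the real code
  });
  return g_props[id];
}

int main() {
  const FakeDeviceProp& p0 = GetDeviceProperties(0);
  GetDeviceProperties(0);  // cached: no second "querying driver" line for device 0
  std::printf("device 0 capability: %d.%d\n", p0.major, p0.minor);
  GetDeviceProperties(1);  // first query for device 1
}
```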
*/ + +#pragma once + +#include + +namespace paddle { +namespace platform { +namespace ipu { + +static constexpr const char *sIpuIndexAttr = "ipu_index"; +static constexpr const char *sIpuStageAttr = "ipu_stage"; +static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; +static constexpr const char *sDebugInfoId = "__debug_info_id"; + +static constexpr const char *sBeta1 = "beta1"; +static constexpr const char *sBeta2 = "beta2"; +static constexpr const char *sBeta1Pow = "Beta1Pow"; +static constexpr const char *sBeta2Pow = "Beta2Pow"; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.cc b/paddle/fluid/platform/device/ipu/device.cc new file mode 100644 index 0000000000000..47e6475089d3f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/device.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/device.h" + +namespace paddle { +namespace platform { +namespace ipu { + +Device::Device(const popart::DeviceInfo& device_info) + : id_(device_info.getId()), is_attached_(device_info.isAttached()) { + popart::DeviceType popart_device_type = device_info.getType(); + switch (popart_device_type) { + case popart::DeviceType::IpuModel: + device_type_ = DeviceType::IpuModel; + break; + case popart::DeviceType::Ipu: + device_type_ = DeviceType::Ipu; + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "popart::DeviceType:Unsupported type %d", popart_device_type)); + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.h b/paddle/fluid/platform/device/ipu/device.h new file mode 100644 index 0000000000000..24a8bdec3087c --- /dev/null +++ b/paddle/fluid/platform/device/ipu/device.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
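device.cc above translates popart::DeviceType into the framework's own ipu::DeviceType with an explicit switch and rejects anything it does not model yet. A toy analogue of that translation, with a stand-in vendor enum in place of the PopART one:

```cpp
// Vendor-enum to framework-enum translation with an explicit failure path.
#include <cstdio>
#include <stdexcept>

enum class VendorDeviceType { IpuModel, Ipu, Sim };              // stand-in for popart::DeviceType
enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim };

DeviceType Translate(VendorDeviceType t) {
  switch (t) {
    case VendorDeviceType::IpuModel: return DeviceType::IpuModel;
    case VendorDeviceType::Ipu:      return DeviceType::Ipu;
    default:
      throw std::invalid_argument("unsupported vendor device type");
  }
}

int main() {
  std::printf("translated to %d\n", static_cast<int>(Translate(VendorDeviceType::Ipu)));
}
```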
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { + +enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim }; + +class Device { + public: + Device() {} + explicit Device(const popart::DeviceInfo& device_info); + + int getId() const { return id_; } + bool isAttached() const { return is_attached_; } + DeviceType getType() const { return device_type_; } + + private: + int id_; + bool is_attached_; + DeviceType device_type_; + /* TODO:: Add more elements in the future */ +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc new file mode 100644 index 0000000000000..cd0f5ae554cb4 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/ipu_backend.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace platform { +namespace ipu { + +std::shared_ptr IpuBackend::instance_ = nullptr; + +IpuBackend::IpuBackend() { + compiler_ = std::make_shared(); + executor_ = std::make_unique(); +} + +void IpuBackend::Clear() { + executor_.reset(); + // detach device + if (device_ != nullptr && device_->isAttached()) { + device_->detach(); + device_.reset(); + device_ = nullptr; + } +} + +IpuBackend::~IpuBackend() { Clear(); } + +std::shared_ptr IpuBackend::GetInstance() { + if (!instance_) { + instance_.reset(new IpuBackend()); + } + return instance_; +} + +// This api should only call from python, always return a new object +std::shared_ptr IpuBackend::GetNewInstance() { + instance_.reset(new IpuBackend()); + return instance_; +} + +void IpuBackend::Compile(framework::ir::Graph* graph, + const std::vector& feed_list, + const std::vector& fetch_list) { + VLOG(10) << "enter IpuBackend::Compile"; + compiler_->InitInputs(graph, feed_list); + compiler_->LowerWeights(graph, scope_); + compiler_->LowerBody(graph); + compiler_->InitOutputs(fetch_list); + executor_->SetWeights(compiler_->GetWeights()); + VLOG(10) << "leave IpuBackend::Compile"; +} + +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, + const framework::ExecutionContext& ctx) { + Prepare(); + auto inputs_id = compiler_->GetInputs(); + auto outputs_id = compiler_->GetOutputs(); + executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx); +} + +void IpuBackend::Prepare() { + if (is_prepared_) { + return; + } else { + is_prepared_ = true; + } + // convert Model to fp16 + if (ipu_strategy_->enable_fp16) { + compiler_->ConvertProtoToFp16(); + } + auto proto = compiler_->GetModelProto(); + auto tensors = compiler_->GetTensors(); 
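IpuBackend above is handed out as a shared_ptr singleton: GetInstance() lazily creates the one shared backend, while GetNewInstance() (intended to be called from Python) always replaces it. A stripped-down sketch of that pair, simplified and, like the original, not guarded against concurrent first use:

```cpp
// shared_ptr singleton with a lazy getter and a "reset" getter.
#include <cstdio>
#include <memory>

class Backend {
 public:
  static std::shared_ptr<Backend> GetInstance() {
    if (!instance_) instance_.reset(new Backend());  // lazily create the shared backend
    return instance_;
  }
  static std::shared_ptr<Backend> GetNewInstance() {
    instance_.reset(new Backend());                  // always construct a fresh backend
    return instance_;
  }

 private:
  Backend() { std::puts("backend constructed"); }
  static std::shared_ptr<Backend> instance_;
};

std::shared_ptr<Backend> Backend::instance_ = nullptr;

int main() {
  auto a = Backend::GetInstance();
  auto b = Backend::GetInstance();     // same object, no second construction
  std::printf("same instance: %s\n", a == b ? "yes" : "no");
  auto c = Backend::GetNewInstance();  // constructs and installs a new backend
  std::printf("replaced: %s\n", a == c ? "no" : "yes");
}
```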
+ auto outputs = compiler_->GetOutputs(); + executor_->Prepare(proto, tensors, outputs, device_); +} + +void IpuBackend::SetScope(const framework::Scope& scope) { + scope_ = &scope; + executor_->SetScope(&scope); +} + +void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { + ipu_strategy_ = &strategy; + executor_->SetIpuStrategy(strategy); + compiler_->SetIpuStrategy(strategy); +} + +size_t IpuBackend::GetNumDevices() { + // IpuModel + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) return 1; + // Real dev + size_t num_devices = + popart::DeviceManager::createDeviceManager().enumerateDevices().size(); + PADDLE_ENFORCE_GT( + num_devices, 0, + platform::errors::Unavailable( + "Do not found any IPU devices, please make " + "sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\"")); + return num_devices; +} + +std::vector IpuBackend::GetDeviceIds() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return {0}; + } + std::vector device_ids; + auto devices = + popart::DeviceManager::createDeviceManager().enumerateDevices(); + PADDLE_ENFORCE_GT( + devices.size(), 0, + platform::errors::Unavailable("Do not found any IPU devices, please make " + "sure Poplar sdk is enabled.")); + + for (auto device : devices) { + device_ids.push_back(device->getId()); + } + + return device_ids; +} + +Device IpuBackend::GetDevice(int id) { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + std::map deviceOpts{{"numIPUs", "1 "}}; + device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( + deviceOpts); + Device device(*device_.get()); + return device; + } + size_t num_devices = GetNumDevices(); + if (id < 0 || id >= num_devices) { + PADDLE_THROW(platform::errors::InvalidArgument( + "device id %d is invalid, number devices is %d", id, num_devices)); + } + std::shared_ptr popart_device_info = + popart::DeviceManager::createDeviceManager().getDevice( + popart::SyncPattern::Full, id); + Device device(*popart_device_info.get()); + return device; +} + +void IpuBackend::AttachDevice(int id) { + // trick here + // Compiler ipu is not same as the runtime ipu. + VLOG(10) << "comile ipu id = " << id; + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return; + } + device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( + UpperIpuNum()); + PADDLE_ENFORCE_NOT_NULL( + device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.", + UpperIpuNum())); +} + +bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; } + +// num_ipus must be pow(2,n); +int IpuBackend::UpperIpuNum() { + PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0, + platform::errors::Unavailable( + "The ipu num get is wrong, please make sure the " + "sharding or pipline parameter is right.")); + int i = 0; + while (std::pow(2, i) < ipu_strategy_->num_ipus) { + i++; + } + return std::pow(2, i); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h new file mode 100644 index 0000000000000..769a1b5b52ab8 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
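UpperIpuNum() above rounds the requested ipu_strategy_->num_ipus up to the next power of two, since the code expects device counts of that form when acquiring IPUs. A host-side check of that rounding, duplicated here only for illustration:

```cpp
// Round a requested IPU count up to the next power of two, as UpperIpuNum does.
#include <cmath>
#include <cstdio>

int UpperIpuNum(int num_ipus) {
  int i = 0;
  while (std::pow(2, i) < num_ipus) i++;
  return static_cast<int>(std::pow(2, i));
}

int main() {
  const int samples[] = {1, 2, 3, 5, 8, 9};
  for (int n : samples) std::printf("num_ipus=%d -> acquire %d\n", n, UpperIpuNum(n));
  // Prints 1, 2, 4, 8, 8, 16.
}
```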
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/ipu/device.h" +#include "paddle/fluid/platform/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/ipu/ipu_executor.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class IpuBackend { + // IpuBackend is the center of paddle-ipu, its function include: + // 1. Compile paddle model to popart model + // 2. Run popart model, inference or training + // 3. Request and release device + // 4. Other helper function + + public: + IpuBackend(); + ~IpuBackend(); + + void Clear(); + + // return if exsits, else create and return + static std::shared_ptr GetInstance(); + + // always return a new instance_ + static std::shared_ptr GetNewInstance(); + + // what compile does include(call compiler_): + // 1. map paddle-op -> poart op + // 2. construct popart onnx compute graph + void Compile(framework::ir::Graph *graph, + const std::vector &feed_list, + const std::vector &fetch_list); + + // what run does include: + // 1. construct forward onnx graph + // 2. graph-level optimization + // 3. autodiff + void Run(const std::vector &inputs, + const std::vector &outputs, + const framework::ExecutionContext &ctx); + + Executor &GetExecutor() { return *executor_; } + + void SetScope(const framework::Scope &scope); + const framework::Scope *GetScope() { return scope_; } + void SetIpuStrategy(const IpuStrategy &strategy); + const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } + + // Device + size_t GetNumDevices(); + std::vector GetDeviceIds(); + Device GetDevice(int id); + void AttachDevice(int id); + bool DeviceIsAttached(); + + private: + int UpperIpuNum(); + void Prepare(); + + private: + std::shared_ptr compiler_; + std::unique_ptr executor_; + std::shared_ptr device_; + bool is_prepared_ = false; + + // not own + const framework::Scope *scope_ = nullptr; + const IpuStrategy *ipu_strategy_ = nullptr; + + private: + static std::shared_ptr instance_; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc new file mode 100644 index 0000000000000..a1c5ed4fefbf3 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/ipu/ipu_compiler.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +template +T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); + } else { + return {}; + } +} + +template +nonstd::optional GetOptAttrAllowNull(std::string attr, + framework::OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); + } else { + return {}; + } +} + +Compiler::Compiler() { + builder_ = popart::Builder::create(); + RegisterOpFunc(); +} + +Compiler::~Compiler() {} + +void Compiler::RegisterOpFunc() { + VLOG(10) << "enter Compiler::RegisterOpFunc"; +#define INT_VEC std::vector +#define FLOAT_VEC std::vector +#define FLOAT float +#define INT std::int64_t +#define BOOL bool +#define STRING std::string +#define STRING_VEC std::vector +#define NONE + +#define ARG(Type, Name) , GetAttrAllowNull(#Name, op_desc) +#define OPT_ARG(Type, Name) , GetOptAttrAllowNull(#Name, op_desc) +#define POPART_CONST_ARG(Name) , const PopartConstant& Name +#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name +#define POPART_ATTRIB_VEC_ARG(Name) +#define BODY_ARG(Name) NONE + + name_function_ = { +#define OP_DECL(FuncName, OnnxImpl, Args) \ + {#FuncName, [&](framework::OpDesc* op_desc) { \ + auto op_type = op_desc->Type(); \ + VLOG(10) << "build op:" << op_type << " args " << #Args; \ + auto inputs = GetOpInputs(op_desc); \ + auto output_names = GetOpOutputs(op_desc); \ + auto debug_context = BuildDebugContext(op_desc); \ + auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ + auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ + auto output_ids = OnnxImpl(inputs Args, debug_context); \ + SetIpuIndexStage(output_ids, op_desc); \ + InsertTensors(output_names, output_ids); \ + }}, // NOLINT +#include "paddle/fluid/platform/ipu/supported_ops_autogen.h" + }; + +#undef OP_DECL +#undef BODY_ARG +#undef POPART_ATTRIB_VEC_ARG +#undef HOST_SIDE_CONST_ARG +#undef POPART_CONST_ARG +#undef OPT_ARG +#undef ARG +#undef NONE +#undef STRING_VEC +#undef STRING +#undef BOOL +#undef INT +#undef FLOAT +#undef FLOAT_VEC +#undef INT_VEC +} + +void Compiler::LowerBody(const framework::ir::Graph* graph) { + VLOG(10) << "enter Compiler::LowerBody"; + auto nodes = framework::ir::TopologySortOperations(*graph); + for (auto* node : nodes) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "node->type: " << op_type; + + if (op_type == "popart_constant") { + auto dims = + BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); + auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); + auto dtype = OnnxDtype2PopartType(dtype_); + popart::TensorInfo tensor_info{dtype, dims}; + auto value_attr = op_desc->GetAttr("value"); + auto const_data = std::unique_ptr{}; + switch (dtype) { + case popart::DataType::FLOAT: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::INT32: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::DOUBLE: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + 
tensor_info)); + break; + case popart::DataType::INT64: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "The popart datatype is not supported, popart::DataType is %d", + dtype)); + } + popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_batchnormalization") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto num_outputs = outputs.size(); + auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon")); + auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); + auto result = builder_->aiOnnxOpset11().batchnormalization( + inputs, num_outputs, epsilon, momentum); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_nllloss") { + auto inputs = GetOpInputs(op_desc); + auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex")); + auto result = builder_->aiGraphcoreOpset1().nllloss( + inputs, popart::ReductionType::NoReduction, ignoreIndex); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_topk") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + int64_t axis = BOOST_GET_CONST(int64_t, op_desc->GetAttr("axis")); + int sorted_INT32 = BOOST_GET_CONST(int, op_desc->GetAttr("sorted")); + int64_t sorted = int64_t{sorted_INT32}; + + auto aiOnnxOpset = builder_->aiOnnxOpset11(); + + popart::ConvInputs result; + if (inputs.size() == 2) { + VLOG(10) + << "[Compiler::LowerBody] size of inputs for is 2"; + result = aiOnnxOpset.topk(inputs, axis, sorted); + } else if (inputs.size() == 1) { + VLOG(10) + << "[Compiler::LowerBody] size of inputs for is 1"; + int64_t k = BOOST_GET_CONST(int64_t, op_desc->GetAttr("k")); + popart::TensorInfo kShape{"INT64", std::vector{1}}; + popart::ConstVoidData kData = {&k, kShape}; + auto K_t = aiOnnxOpset.constant(kData); + result = aiOnnxOpset.topk({inputs[0], K_t}, axis, sorted); + } + result[1] = aiOnnxOpset.cast({result[1]}, "INT32"); + SetIpuIndexStage(result, op_desc); + VLOG(10) << "[Compiler::LowerBody] output[1]: " << outputs[1]; + VLOG(10) << "[Compiler::LowerBody] output[1]: " + << GetOpOutputs(op_desc)[1] << " -> " << result[1]; + tensors_.emplace(GetOpOutputs(op_desc)[1], result[1]); // topk indices + VLOG(10) << "[Compiler::LowerBody] output[0]: " << outputs[0]; + VLOG(10) << "[Compiler::LowerBody] output[0]: " + << GetOpOutputs(op_desc)[0] << " -> " << result[0]; + tensors_.emplace(GetOpOutputs(op_desc)[0], result[0]); // topk values + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "Op %s is not registered in popart canonicalization", op_type)); + } + } + } + VLOG(10) << "leave Compiler::LowerBody"; +} + +void Compiler::InitInputs(framework::ir::Graph* graph, + const std::vector& feed_list) { + for (const auto& feed_name : feed_list) { + feed_list_.push_back(feed_name); + for (const framework::ir::Node* n : graph->Nodes()) { + if (n->IsVar()) { + auto* var_desc = n->Var(); + if (feed_name == var_desc->Name()) { + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + if 
(ipu_strategy_->enable_fp16) {
+            data_type = popart::DataType::FLOAT16;
+          }
+          popart::TensorInfo input_info{data_type, var_desc->GetShape()};
+          VLOG(10) << "popart input_info = " << input_info;
+          popart::TensorId tensor_id =
+              builder_->addInputTensor(input_info, feed_name);
+          VLOG(10) << "popart input tensor id = " << tensor_id;
+          inputs_.push_back(tensor_id);
+          tensors_.emplace(var_desc->Name(), tensor_id);
+        }
+      }
+    }
+  }
+}
+
+void Compiler::InitOutputs(const std::vector& fetch_list) {
+  for (const auto& fetch_name : fetch_list) {
+    fetch_list_.push_back(fetch_name);
+    auto tensor = tensors_.find(fetch_name);
+    PADDLE_ENFORCE_NE(tensor, tensors_.end(),
+                      platform::errors::NotFound(
+                          "output tensor %s does not exist.", fetch_name));
+    VLOG(10) << "fetch_name= " << fetch_name;
+    VLOG(10) << "popart output tensor id = " << tensor->second;
+    builder_->addOutputTensor(tensor->second);
+    outputs_.push_back(tensor->second);
+  }
+}
+
+void Compiler::LowerWeights(const framework::ir::Graph* graph,
+                            const framework::Scope* scope_) {
+  PADDLE_ENFORCE_NOT_NULL(scope_,
+                          platform::errors::PreconditionNotMet(
+                              "You should call set_scope before LowerWeights"));
+  // at this step, the graph does not contain optimizer-related states
+  for (const auto* node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      if (node->Var()->Persistable() && node->inputs.empty()) {
+        auto var_name = node->Var()->Name();
+        // workaround: https://github.com/graphcore/Paddle/issues/151
+        if (tensors_.count(var_name) != 0) {
+          continue;
+        }
+
+        auto var = scope_->FindVar(var_name);
+        if (var) {
+          auto tensor = var->Get();
+          auto dtype = VarType2PopartType(tensor.type());
+          auto shape = std::vector();
+          for (size_t i = 0; i < tensor.dims().size(); ++i) {
+            shape.push_back(tensor.dims().at(i));
+          }
+          popart::TensorInfo tensor_info(dtype, shape);
+          popart::ConstVoidData const_data{tensor.data(), tensor_info};
+          popart::TensorId result =
+              builder_->addInitializedInputTensor(const_data, var_name);
+          tensors_.emplace(var_name, result);
+          weights_.push_back(result);
+        }
+      }
+    }
+  }
+}
+
+void Compiler::InsertTensors(const std::vector& output_names,
+                             const std::vector& tensor_ids) {
+  PADDLE_ENFORCE_EQ(output_names.size(), tensor_ids.size(),
+                    platform::errors::Fatal("InsertTensors size mismatch"));
+  for (size_t i = 0; i < tensor_ids.size(); i++) {
+    tensors_.emplace(output_names[i], tensor_ids[i]);
+  }
+}
+
+void Compiler::InsertTensors(const std::vector& output_names,
+                             const std::string& tensor_id) {
+  PADDLE_ENFORCE_EQ(output_names.size(), 1,
+                    platform::errors::Fatal("InsertTensors size mismatch"));
+  tensors_.emplace(output_names[0], tensor_id);
+}
+
+void Compiler::SetIpuIndexStage(const std::vector& tensor_ids,
+                                const framework::OpDesc* op_desc) {
+  VLOG(10) << "enter Compiler::SetIpuIndexStage";
+  auto tensor_ids_set =
+      std::set(tensor_ids.begin(), tensor_ids.end());
+
+  if (op_desc->HasAttr(sIpuIndexAttr)) {
+    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr));
+    builder_->virtualGraph(tensor_ids_set, ipu_index);
+    VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index
+             << " for op: " << op_desc->Type();
+    if (op_desc->HasAttr(sIpuStageAttr)) {
+      auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr));
+      builder_->pipelineStage(tensor_ids_set, ipu_stage);
+      VLOG(10) << "set " << sIpuStageAttr << " = " << ipu_stage
+               << " for op: " << op_desc->Type();
+    }
+  }
+  VLOG(10) << "leave Compiler::SetIpuIndexStage";
+}
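+
+// For reference, each handler generated by RegisterOpFunc uses the helpers in
+// this file roughly as follows (illustrative sketch of the OP_DECL expansion
+// for popart_scale_v2; the real code is produced by the macro, not this sketch):
+//
+//   {"popart_scale_v2", [&](framework::OpDesc* op_desc) {
+//     auto inputs = GetOpInputs(op_desc);
+//     auto output_ids = builder_->aiGraphcoreOpset1().scale(
+//         inputs, GetAttrAllowNull<float>("scale", op_desc),
+//         BuildDebugContext(op_desc));
+//     SetIpuIndexStage(output_ids, op_desc);
+//     InsertTensors(GetOpOutputs(op_desc), output_ids);
+//   }},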
+ +void Compiler::SetIpuIndexStage(const std::string& tensor_id, + const framework::OpDesc* op_desc) { + VLOG(10) << "enter Compiler::SetIpuIndexStage"; + + if (op_desc->HasAttr(sIpuIndexAttr)) { + auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); + builder_->virtualGraph(tensor_id, ipu_index); + VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index + << " for op: " << op_desc->Type(); + if (op_desc->HasAttr(sIpuStageAttr)) { + auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); + builder_->pipelineStage(tensor_id, ipu_stage); + VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + << " for op: " << op_desc->Type(); + } + } + VLOG(10) << "leave Compiler::SetIpuIndexStage"; +} + +std::vector& Compiler::GetWeights() { return weights_; } + +// convertFloatsToHalfs +void Compiler::ConvertProtoToFp16() { + popart::GraphTransformer graph_transformer(builder_->getModelProto()); + graph_transformer.convertFloatsToHalfs(); + converted_proto_ = graph_transformer.getModelProto(); +} + +std::string Compiler::GetModelProto() { + if (converted_proto_.length()) { + return converted_proto_; + } + return builder_->getModelProto(); +} + +void Compiler::SaveModelProto(const std::string& path) { + builder_->saveModelProto(path); +} + +void Compiler::SaveModelProtoNoCheck(const std::string& path) { + auto proto = GetModelProto(); + std::ofstream onnxfile(path, std::ios_base::binary); + onnxfile.write(proto.data(), proto.size()); + onnxfile.close(); +} + +std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { + auto ins = op->Input("__inputs__"); + std::vector inputs; + for (const auto& in : ins) { + if (tensors_.find(in) != tensors_.end()) { + inputs.push_back(tensors_[in]); + } else { + inputs.push_back(in); + } + } + return inputs; +} + +const std::vector& Compiler::GetOpOutputs( + const framework::OpDesc* op) { + return op->Output("__outputs__"); +} + +popart::DebugContext Compiler::BuildDebugContext(const framework::OpDesc* op) { + auto op_identify_id = + BOOST_GET_CONST(std::string, op->GetAttr(sOpIdentifyIdAttr)); + VLOG(10) << "op_identify_id of op: " << op->Type() << " is " + << op_identify_id; + return popart::DebugContext(op_identify_id); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h new file mode 100644 index 0000000000000..ecee1595bb892 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -0,0 +1,93 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/ipu/common.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class Compiler { + public: + Compiler(); + ~Compiler(); + void RegisterOpFunc(); + void LowerBody(const framework::ir::Graph *graph); + void InitInputs(framework::ir::Graph *graph, + const std::vector &feed_list); + void InitOutputs(const std::vector &fetch_list); + void LowerWeights(const framework::ir::Graph *graph, + const framework::Scope *scope_); + + void InsertTensors(const std::vector &output_names, + const std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const framework::OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, + const framework::OpDesc *op_desc); + + std::vector GetInputs() { return inputs_; } + std::vector GetOutputs() { return outputs_; } + std::map GetTensors() { return tensors_; } + std::vector &GetWeights(); + + std::string GetModelProto(); + void SetIpuStrategy(const IpuStrategy &strategy) { + ipu_strategy_ = &strategy; + }; + void SaveModelProto(const std::string &path); + void SaveModelProtoNoCheck(const std::string &path); + void ConvertProtoToFp16(); + + private: + std::vector GetOpInputs(const framework::OpDesc *op); + const std::vector &GetOpOutputs(const framework::OpDesc *op); + popart::DebugContext BuildDebugContext(const framework::OpDesc *op); + + private: + std::unique_ptr builder_; + + using OpFunc = std::function; + std::unordered_map name_function_; + + // stateful variable + std::map tensors_; + + // feed_list_ & fetch_list save paddle tensor id + std::vector feed_list_; + std::vector fetch_list_; + + // inputs_ & outputs_ save popart tensor id + std::vector inputs_; + std::vector outputs_; + + // weights info map + std::vector weights_; + + std::string converted_proto_ = ""; + const IpuStrategy *ipu_strategy_ = nullptr; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc new file mode 100644 index 0000000000000..a7978ba6f37b1 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_executor.h" + +namespace paddle { +namespace platform { +namespace ipu { + +Executor::Executor() {} + +Executor::~Executor() {} + +void Executor::Prepare(const std::string &proto, + const std::map &tensors, + const std::vector &outputs, + std::shared_ptr device) { + auto art = popart::AnchorReturnType("All"); + std::map anchor_ids; + for (const auto &id : outputs) { + anchor_ids.emplace(id, art); + } + + auto dataFlow = popart::DataFlow(ipu_strategy_->batches_per_step, anchor_ids); + + PADDLE_ENFORCE_NOT_NULL(device, platform::errors::Unavailable( + "IPU device isn't attached, please call " + "IpuBackend::AttachDevice(id) first.")); + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + VLOG(10) << "Creating TrainingSession from Onnx Model..."; + auto popart_optimizer = GetPopartOptimizer(opt_info); + + auto it = tensors.find(opt_info.GetLoss()); + PADDLE_ENFORCE_NE( + it, tensors.end(), + paddle::platform::errors::InvalidArgument( + "loss_id = %s doesn't exist in popart graph.", opt_info.GetLoss())); + + session_ = popart::TrainingSession::createFromOnnxModel( + proto, dataFlow, it->second, *popart_optimizer, device, + popart::InputShapeInfo(), ipu_strategy_->popart_options_, + popart::Patterns(popart::PatternsLevel::Default)); + } else { + VLOG(10) << "Creating InferenceSession from Onnx Model..."; + session_ = popart::InferenceSession::createFromOnnxModel( + proto, dataFlow, device, popart::InputShapeInfo(), + ipu_strategy_->popart_options_, + popart::Patterns(popart::PatternsLevel::Default)); + } + VLOG(10) << "Creating session from Onnx Model...done"; + + VLOG(10) << "Preparing session device..."; + session_->prepareDevice(); + VLOG(10) << "Preparing session device...done"; + + SetWeightsIO(); + + VLOG(10) << "Copy weights from paddle to popart..."; + WeightsFromPaddle(); + VLOG(10) << "Copy weights from paddle to popart...done"; + + VLOG(10) << "Copy weights from host to device..."; + session_->weightsFromHost(); + VLOG(10) << "Copy weights from host to device...done"; + + if (ipu_strategy_->save_init_onnx) { + session_->modelToHost("test_init.onnx"); + } +} + +void Executor::Run(const std::vector &inputs_id, + const std::vector &inputs, + const std::vector &outputs_id, + const std::vector &outputs, + const framework::ExecutionContext &ctx) { + // inputs + std::map popart_inputs; + std::map input_wrappers; + for (size_t i = 0; i < inputs.size(); i++) { + auto tensor_id = inputs_id[i]; + framework::Tensor *tensor = nullptr; + tensor->ShareDataWith(*inputs[i]); + input_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); + } + // anchors + std::map popart_anchors; + std::map anchor_wrappers; + for (size_t i = 0; i < outputs.size(); i++) { + auto tensor_id = outputs_id[i]; + framework::Tensor *tensor = nullptr; + tensor->ShareDataWith(*outputs[i]); + // get dims & dtype from session + auto fetch_info = session_->getInfo(tensor_id); + auto output_shape = fetch_info.shape(); + if (ipu_strategy_->batches_per_step > 1) { + output_shape.insert(output_shape.begin(), + ipu_strategy_->batches_per_step); + } + tensor->Resize(framework::make_ddim(output_shape)); + auto fetch_dtype = fetch_info.dataType(); + auto paddle_type = PopartType2VarType(fetch_dtype); + tensor->mutable_data(ctx.GetPlace(), paddle_type); + anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); + } + + if (ipu_strategy_ != nullptr && 
ipu_strategy_->is_training) { + VLOG(10) << "Update optimizer learning rate..."; + SetLR(GetLRFromScope()); + auto popart_optimizer = GetPopartOptimizer(opt_info); + auto &session = dynamic_cast(*session_); + session.updateOptimizerFromHost(popart_optimizer.get()); + } + + popart::StepIO stepio(popart_inputs, popart_anchors); + VLOG(10) << "Running..."; + session_->run(stepio); + VLOG(10) << "Running...done"; + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + session_->weightsToHost(); + WeightsToPaddle(); + if (ipu_strategy_->save_last_onnx) { + session_->modelToHost("test_last.onnx"); + } + } +} + +void Executor::SetOptimizerType(const std::string &type) { + opt_info.SetType(type); +} + +void Executor::SetLR(float lr_rate) { opt_info.SetLR(lr_rate); } + +void Executor::SetOptimizerAttr(const std::string &attr, float value) { + opt_info.SetAttr(attr, value); +} + +void Executor::SetLoss(const std::string &loss) { opt_info.SetLoss(loss); } + +void Executor::SetLRVarName(const std::string &name) { + opt_info.SetLRVarName(name); +} + +void Executor::SetWeights(const std::vector &weights) { + weights_ = weights; +} + +void Executor::SetWeightsIO() { + auto opt_type = opt_info.GetType(); + auto pre_post_fix = GetOptPrePostfix(opt_type); + for (const auto &weight_id : weights_) { + for (const auto &pair : pre_post_fix) { + if (!IsOptimizerSupported(opt_type)) { + continue; + } + + // pair.first : popart prefix, pair.second : paddle postfix + auto popart_var_name = pair.first + weight_id; + auto paddle_var_name = weight_id + pair.second; + + if (scope_->FindVar(paddle_var_name) == nullptr) { + continue; + } + + auto var = scope_->GetVar(paddle_var_name); + auto data_ptr = var->GetMutable()->data(); + + auto tensor_info = session_->getInfo(popart_var_name); + weights_io_.insert(popart_var_name, {data_ptr, tensor_info}); + } + } +} + +void Executor::WeightsFromPaddle() { session_->writeWeights(weights_io_); } + +void Executor::WeightsToPaddle() { session_->readWeights(weights_io_); } + +void Executor::SetIpuStrategy(const IpuStrategy &strategy) { + ipu_strategy_ = &strategy; +} + +float Executor::GetLRFromScope() { + auto lr_var = scope_->GetVar(opt_info.GetLRVarName()); + auto tensor = lr_var->Get(); + + PADDLE_ENFORCE_EQ(tensor.type(), framework::proto::VarType::FP32, + platform::errors::InvalidArgument( + "LR requiree float, but got (%s).", tensor.type())); + + return tensor.data()[0]; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h new file mode 100644 index 0000000000000..400884a2c2b0f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/ipu/common.h" +#include "paddle/fluid/platform/ipu/ipu_optimizer.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class Executor { + public: + Executor(); + ~Executor(); + + void Prepare(const std::string &proto, + const std::map &tensors, + const std::vector &outputs, + std::shared_ptr device); + + void Run(const std::vector &inputs_id, + const std::vector &inputs, + const std::vector &outputs_id, + const std::vector &outputs, + const framework::ExecutionContext &ctx); + + // Optimizer + void SetOptimizerType(const std::string &type); + void SetOptimizerAttr(const std::string &attr, float value); + void SetLoss(const std::string &loss); + void SetLR(float lr_rate); + void SetLRVarName(const std::string &name); + + void SetWeights(const std::vector &info); + + void SetWeightsIO(); + void WeightsFromPaddle(); + void WeightsToPaddle(); + + // Scope + void SetScope(const framework::Scope *scope) { scope_ = scope; } + + // Strategy + void SetIpuStrategy(const IpuStrategy &strategy); + + private: + float GetLRFromScope(); + + public: + OptmizerMetaInfo opt_info; + std::unique_ptr session_; + + private: + const framework::Scope *scope_ = nullptr; + const IpuStrategy *ipu_strategy_ = nullptr; + popart::WeightsIO weights_io_; + std::vector weights_; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc new file mode 100644 index 0000000000000..c184149a9d38d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace platform { + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedIPUDevices() { + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetDeviceIds(); +} + +//! Get the total number of IPU devices in system. +int GetIPUDeviceCount() { + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetNumDevices(); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/device/ipu/ipu_info.h similarity index 50% rename from paddle/fluid/platform/type_defs.h rename to paddle/fluid/platform/device/ipu/ipu_info.h index 88a2d16472fa7..3d032eeb4bfc1 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -1,40 +1,24 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#ifdef PADDLE_WITH_HIP -#include -#else -#include -#endif +#ifdef PADDLE_WITH_IPU +#include +#include +#include "glog/logging.h" namespace paddle { - -#ifdef PADDLE_WITH_HIP -#define gpuSuccess hipSuccess -using gpuStream_t = hipStream_t; -using gpuError_t = hipError_t; -using gpuEvent_t = hipEvent_t; -using gpuDeviceProp = hipDeviceProp_t; -#else -#define gpuSuccess cudaSuccess -using gpuStream_t = cudaStream_t; -using gpuError_t = cudaError_t; -using gpuEvent_t = cudaEvent_t; -using gpuDeviceProp = cudaDeviceProp; -#endif - -using CUDAGraphID = unsigned long long; // NOLINT +namespace platform { +std::vector GetSelectedIPUDevices(); +int GetIPUDeviceCount(); +} // namespace platform } // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.cc b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc new file mode 100644 index 0000000000000..92bb2ca3afcf8 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device/ipu/ipu_optimizer.h" + +namespace paddle { +namespace platform { +namespace ipu { + +OptmizerMetaInfo::OptmizerMetaInfo() {} + +OptmizerMetaInfo::~OptmizerMetaInfo() {} + +void OptmizerMetaInfo::SetType(const std::string &type) { + type_ = OptTypeStr2Enum(type); +} + +float OptmizerMetaInfo::GetAttr(const std::string &attr, + float default_value) const { + if (attrs_.count(attr) == 0) { + return default_value; + } + return attrs_.at(attr); +} + +void OptmizerMetaInfo::SetAttr(const std::string &attr, float value) { + attrs_[attr] = value; +} + +OptimizerType OptTypeStr2Enum(const std::string type) { + if (type == "sgd") { + return OptimizerType::SGD; + } else if (type == "adam") { + return OptimizerType::Adam; + } else if (type == "lamb") { + return OptimizerType::Lamb; + } else { + return OptimizerType::Undefined; + } +} + +std::unique_ptr GetPopartOptimizer( + const OptmizerMetaInfo &opt_meta_info) { + auto opt_type = opt_meta_info.GetType(); + PADDLE_ENFORCE_NE( + opt_type, OptimizerType::Undefined, + platform::errors::InvalidArgument("Optimizer type have not been set.")); + + if (opt_type == OptimizerType::SGD) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(popart::SGD::getUnsetWeightDecay()), + popart::OptimizerValue(popart::SGD::getUnsetMomentum()), + popart::OptimizerValue(popart::SGD::getUnsetDampening()), + popart::OptimizerValue(popart::SGD::getUnsetVelocityScaling()), + popart::OptimizerValue(popart::SGD::getUnsetLossScaling())); + return optimizer; + } else if (opt_type == OptimizerType::Adam) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(popart::Adam::getUnsetWeightDecay()), + popart::OptimizerValue(opt_meta_info.GetAttr("beta1"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta2"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("epsilon"), false), + popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), + popart::AdamMode::Adam, popart::WeightDecayMode::Decay, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::FLOAT); + return optimizer; + } else if (opt_type == OptimizerType::Lamb) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(opt_meta_info.GetAttr("weight_decay"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta1"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta2"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("epsilon"), false), + popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), + popart::AdamMode::Lamb, popart::WeightDecayMode::Decay, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::FLOAT); + return optimizer; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Optimizer %d is not implemented now.", static_cast(opt_type))); + } +} + +bool IsOptimizerSupported(OptimizerType type) { + switch (type) { + case OptimizerType::SGD: + case OptimizerType::Adam: + case OptimizerType::Lamb: + return true; + default: + return false; + } +} + +std::vector> GetOptPrePostfix( + OptimizerType opt_type) { + // format: {popart_tensor_id, paddle_tensor_id}, ... 
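+  // e.g. (illustrative, with a hypothetical Adam weight named "fc_0.w_0"):
+  // the pair {"Accl1___", "_moment1_0"} links the popart tensor
+  // "Accl1___fc_0.w_0" to the paddle variable "fc_0.w_0_moment1_0"
+  // (see how Executor::SetWeightsIO combines the prefix and postfix).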
+ std::vector> pre_post_fix; + + switch (opt_type) { + case OptimizerType::SGD: + pre_post_fix.push_back(std::make_pair("", "")); + break; + case OptimizerType::Adam: + case OptimizerType::Lamb: + pre_post_fix.push_back(std::make_pair("", "")); + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + break; + default: + pre_post_fix.push_back(std::make_pair("", "")); + break; + } + + return pre_post_fix; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.h b/paddle/fluid/platform/device/ipu/ipu_optimizer.h new file mode 100644 index 0000000000000..ee16abce398fb --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { + +enum class OptimizerType { SGD = 0, Adam, Lamb, Undefined }; + +class OptmizerMetaInfo { + public: + OptmizerMetaInfo(); + ~OptmizerMetaInfo(); + + void SetType(const std::string &type); + OptimizerType GetType() const { return type_; } + + void SetAttr(const std::string &attr, float value); + float GetAttr(const std::string &attr, float default_value = 0.0f) const; + + void SetLoss(const std::string &loss) { loss_ = loss; } + std::string GetLoss() const { return loss_; } + + void SetLR(float lr_rate) { lr_rate_ = lr_rate; } + float GetLR() const { return lr_rate_; } + + void SetLRVarName(const std::string &name) { lr_var_name_ = name; } + std::string GetLRVarName() const { return lr_var_name_; } + + private: + // type: adam, sgd, ... + OptimizerType type_ = OptimizerType::Undefined; + + // loss: loss TensorId + std::string loss_; + + // attrs: beta1, beta2, ... + std::map attrs_; + + // learning rate + float lr_rate_ = 1.0; + std::string lr_var_name_; +}; + +OptimizerType OptTypeStr2Enum(const std::string type); + +std::unique_ptr GetPopartOptimizer( + const OptmizerMetaInfo &info); + +bool IsOptimizerSupported(OptimizerType type); + +std::vector> GetOptPrePostfix( + OptimizerType type); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc new file mode 100644 index 0000000000000..47e7e332c8fba --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu {} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h new file mode 100644 index 0000000000000..7e07d517e1031 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { +namespace ipu { + +using VirtualGraphMode = popart::VirtualGraphMode; + +struct IpuStrategy { + int num_ipus = 1; + int batches_per_step = 1; + int batch_size = 1; + bool is_training = true; + bool save_init_onnx = false; + bool save_last_onnx = true; + popart::SessionOptions popart_options_; + bool need_avg_shard = false; + bool enable_fp16 = false; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc new file mode 100644 index 0000000000000..08ba50415dd5f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +void* PaddleIArray::data() { return tensor_->data(); } + +popart::DataType PaddleIArray::dataType() const { + return VarType2PopartType(tensor_->type()); +} + +std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); } + +int64_t PaddleIArray::dim(size_t index) const { + return tensor_->dims().at(index); +} + +std::size_t PaddleIArray::nelms() const { + return std::accumulate(shape_.begin(), shape_.end(), static_cast(1), + std::multiplies()); +} + +const popart::Shape PaddleIArray::shape() const { return shape_; } + +popart::DataType VarType2PopartType( + const framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::UINT8: + return popart::DataType::UINT8; + case framework::proto::VarType::INT8: + return popart::DataType::INT8; + case framework::proto::VarType::INT16: + return popart::DataType::INT16; + case framework::proto::VarType::INT32: + return popart::DataType::INT32; + case framework::proto::VarType::INT64: + return popart::DataType::INT64; + case framework::proto::VarType::BOOL: + return popart::DataType::BOOL; + case framework::proto::VarType::FP64: + return popart::DataType::DOUBLE; + case framework::proto::VarType::FP32: + return popart::DataType::FLOAT; + case framework::proto::VarType::FP16: + return popart::DataType::FLOAT16; + case framework::proto::VarType::BF16: + return popart::DataType::BFLOAT16; + case framework::proto::VarType::COMPLEX64: + return popart::DataType::COMPLEX64; + case framework::proto::VarType::COMPLEX128: + return popart::DataType::COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Unsupported Paddle var type.")); + } +} + +framework::proto::VarType::Type PopartType2VarType( + const popart::DataType type) { + switch (type) { + case popart::DataType::UINT8: + return framework::proto::VarType::UINT8; + case popart::DataType::INT8: + return framework::proto::VarType::INT8; + case popart::DataType::INT16: + return framework::proto::VarType::INT16; + case popart::DataType::INT32: + return framework::proto::VarType::INT32; + case popart::DataType::INT64: + return framework::proto::VarType::INT64; + case popart::DataType::BOOL: + return framework::proto::VarType::BOOL; + case popart::DataType::DOUBLE: + return framework::proto::VarType::FP64; + case popart::DataType::FLOAT: + return framework::proto::VarType::FP32; + case popart::DataType::FLOAT16: + return framework::proto::VarType::FP16; + case popart::DataType::BFLOAT16: + return framework::proto::VarType::BF16; + case popart::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case popart::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Unsupported Paddle var type.")); + } +} + +popart::DataType OnnxDtype2PopartType(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case ONNXDataType::BOOL: + return popart::DataType::BOOL; + case ONNXDataType::INT16: + return popart::DataType::INT16; + case ONNXDataType::INT32: + return popart::DataType::INT32; + case ONNXDataType::INT64: + return popart::DataType::INT64; + case ONNXDataType::FLOAT16: + return popart::DataType::FLOAT16; + case ONNXDataType::FLOAT: + return popart::DataType::FLOAT; + case ONNXDataType::DOUBLE: + return popart::DataType::DOUBLE; + case ONNXDataType::UINT8: + return popart::DataType::UINT8; + case ONNXDataType::INT8: + 
return popart::DataType::INT8; + case ONNXDataType::BFLOAT16: + return popart::DataType::BFLOAT16; + case ONNXDataType::COMPLEX64: + return popart::DataType::COMPLEX64; + case ONNXDataType::COMPLEX128: + return popart::DataType::COMPLEX128; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported ONNX data type: %d.", dtype)); + } +} + +// count num should > 0 +bool GetBoolEnv(std::string str) { + char* str_val = getenv(str.c_str()); + if (str_val == NULL) { + return false; + } else { + bool val = false; + if (strcmp(str_val, "1") == 0 || strcmp(str_val, "true") == 0 || + strcmp(str_val, "True") == 0 || strcmp(str_val, "TRUE") == 0) + val = true; + return val; + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h new file mode 100644 index 0000000000000..670427128b870 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// onnx dtype +// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 +enum ONNXDataType : int { + UNDEFINED = 0, + FLOAT = 1, + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + INT32 = 6, + INT64 = 7, + STRING = 8, + BOOL = 9, + FLOAT16 = 10, + DOUBLE = 11, + UINT32 = 12, + UINT64 = 13, + COMPLEX64 = 14, + COMPLEX128 = 15, + BFLOAT16 = 16 +}; + +class PaddleIArray final : public popart::IArray { + public: + explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) { + for (int i = 0; i < tensor->dims().size(); ++i) { + shape_.push_back(tensor->dims().at(i)); + } + } + + public: + void *data(); + popart::DataType dataType() const; + std::size_t rank() const; + int64_t dim(size_t index) const; + std::size_t nelms() const; + const popart::Shape shape() const; + + private: + framework::Tensor *tensor_; + std::vector shape_; +}; + +popart::DataType VarType2PopartType(const framework::proto::VarType::Type type); +framework::proto::VarType::Type PopartType2VarType(const popart::DataType type); +popart::DataType OnnxDtype2PopartType(const int type); +bool GetBoolEnv(std::string str); + +template +std::unique_ptr> Tensor2IArray( + const framework::Tensor &tensor) { + auto dtype = VarType2PopartType(tensor.type()); + auto shape = std::vector(); + for (size_t i = 0; i < tensor.dims().size(); ++i) { + shape.push_back(tensor.dims().at(i)); + } + popart::TensorInfo tensor_info(dtype, shape); + + return std::make_unique>( + reinterpret_cast(tensor.data()), tensor_info); +} + +template +std::unique_ptr> LoDTensor2IArray( + framework::LoDTensor const &lod_tensor) { + PADDLE_ENFORCE_EQ( + lod_tensor.lod().size(), 0UL, + platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented")); + return 
Tensor2IArray(lod_tensor); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h new file mode 100644 index 0000000000000..4cd7f928f6e22 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h @@ -0,0 +1,197 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// clang-format off + +#pragma once + +// Ops from AiGraphcoreOpset1 +OP_DECL(popart_groupnormalization_v2, aiGraphcoreOpset.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon) ) // NOLINT +OP_DECL(popart_subsample_v2, aiGraphcoreOpset.subsample, ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_nop_v2, aiGraphcoreOpset.nop, NONE) // NOLINT +OP_DECL(popart_scale_v2, aiGraphcoreOpset.scale, ARG(FLOAT,scale) ) // NOLINT +OP_DECL(popart_scaledadd_v2, aiGraphcoreOpset.scaledadd, ARG(FLOAT,scale0) ARG(FLOAT,scale1) ) // NOLINT +OP_DECL(popart_gelu_v2, aiGraphcoreOpset.gelu, NONE) // NOLINT +OP_DECL(popart_detach_v2, aiGraphcoreOpset.detach, NONE) // NOLINT +OP_DECL(popart_depthtospace_v2, aiGraphcoreOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_round_v2, aiGraphcoreOpset.round, NONE) // NOLINT +OP_DECL(popart_dynamicslice_v2, aiGraphcoreOpset.dynamicslice, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) ) // NOLINT +OP_DECL(popart_dynamicupdate_v2, aiGraphcoreOpset.dynamicupdate, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) ) // NOLINT +OP_DECL(popart_dynamiczero_v2, aiGraphcoreOpset.dynamiczero, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT +OP_DECL(popart_dynamicadd_v2, aiGraphcoreOpset.dynamicadd, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT +OP_DECL(popart_sequenceslice_v2, aiGraphcoreOpset.sequenceslice, ARG(INT,zeroUnused) ) // NOLINT +OP_DECL(popart_replicatedallreduce_v2, aiGraphcoreOpset.replicatedallreduce, OPT_ARG(INT_VEC,commGroup) ) // NOLINT +OP_DECL(popart_ctcbeamsearchdecoder_v2, aiGraphcoreOpset.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) ) // NOLINT +OP_DECL(popart_shapeddropout_v2, aiGraphcoreOpset.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) ) // NOLINT +OP_DECL(popart_atan2_v2, aiGraphcoreOpset.atan2, NONE) // NOLINT +OP_DECL(popart_expm1_v2, aiGraphcoreOpset.expm1, NONE) // NOLINT +OP_DECL(popart_log1p_v2, aiGraphcoreOpset.log1p, NONE) // NOLINT +OP_DECL(popart_fmod_v2, aiGraphcoreOpset.fmod, NONE) // NOLINT +OP_DECL(popart_remainder_v2, aiGraphcoreOpset.remainder, NONE) // NOLINT +OP_DECL(popart_reverse_v2, aiGraphcoreOpset.reverse, ARG(INT_VEC,dimensions) ) // NOLINT +OP_DECL(popart_bitwisenot_v2, aiGraphcoreOpset.bitwisenot, NONE) // NOLINT +OP_DECL(popart_bitwiseand_v2, aiGraphcoreOpset.bitwiseand, NONE) // NOLINT +OP_DECL(popart_bitwiseor_v2, aiGraphcoreOpset.bitwiseor, NONE) // NOLINT +OP_DECL(popart_bitwisexor_v2, aiGraphcoreOpset.bitwisexor, NONE) // NOLINT 
+OP_DECL(popart_bitwisexnor_v2, aiGraphcoreOpset.bitwisexnor, NONE) // NOLINT +OP_DECL(popart_reducemedian_v2, aiGraphcoreOpset.reducemedian, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +// Ops from AiOnnxOpset11 +OP_DECL(popart_argmax, aiOnnxOpset.argmax, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_argmin, aiOnnxOpset.argmin, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_averagepool, aiOnnxOpset.averagepool, ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT,count_include_pad) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_bitshift, aiOnnxOpset.bitshift, ARG(STRING,direction) ) // NOLINT +OP_DECL(popart_clip, aiOnnxOpset.clip, NONE) // NOLINT +OP_DECL(popart_compress, aiOnnxOpset.compress, OPT_ARG(INT,axis) ) // NOLINT +OP_DECL(popart_concat, aiOnnxOpset.concat, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_concatfromsequence, aiOnnxOpset.concatfromsequence, ARG(INT,axis) ARG(INT,new_axis) ) // NOLINT +OP_DECL(popart_conv, aiOnnxOpset.conv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_convtranspose, aiOnnxOpset.convtranspose, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,output_padding) ARG(INT_VEC,output_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_cumsum, aiOnnxOpset.cumsum, ARG(INT,exclusive) ARG(INT,reverse) ) // NOLINT +OP_DECL(popart_depthtospace, aiOnnxOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_det, aiOnnxOpset.det, NONE) // NOLINT +OP_DECL(popart_dynamicquantizelinear, aiOnnxOpset.dynamicquantizelinear, NONE) // NOLINT +OP_DECL(popart_equal, aiOnnxOpset.equal, NONE) // NOLINT +OP_DECL(popart_flatten, aiOnnxOpset.flatten, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gather, aiOnnxOpset.gather, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gatherelements, aiOnnxOpset.gatherelements, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gathernd, aiOnnxOpset.gathernd, NONE) // NOLINT +OP_DECL(popart_gemm, aiOnnxOpset.gemm, ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(INT,transA) ARG(INT,transB) ) // NOLINT +OP_DECL(popart_hardmax, aiOnnxOpset.hardmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_logsoftmax, aiOnnxOpset.logsoftmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_lppool, aiOnnxOpset.lppool, ARG(INT_VEC,kernel_shape) ARG(INT,p) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_maxpool, aiOnnxOpset.maxpool, ARG(INT,num_outputs) ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT_VEC,dilations) ARG(INT_VEC,pads) ARG(INT,storage_order) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_maxunpool, aiOnnxOpset.maxunpool, ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_nonmaxsuppression, aiOnnxOpset.nonmaxsuppression, ARG(INT,center_point_box) ) // NOLINT +OP_DECL(popart_onehot, aiOnnxOpset.onehot, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_pad, aiOnnxOpset.pad, ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_range, aiOnnxOpset.range, NONE) // NOLINT +OP_DECL(popart_reducel1, aiOnnxOpset.reducel1, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducel2, aiOnnxOpset.reducel2, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducelogsum, aiOnnxOpset.reducelogsum, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducelogsumexp, aiOnnxOpset.reducelogsumexp, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemax, aiOnnxOpset.reducemax, 
OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemean, aiOnnxOpset.reducemean, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemin, aiOnnxOpset.reducemin, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reduceprod, aiOnnxOpset.reduceprod, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducesum, aiOnnxOpset.reducesum, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducesumsquare, aiOnnxOpset.reducesumsquare, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_resize, aiOnnxOpset.resize, ARG(STRING,coordinate_transformation_mode) ARG(FLOAT,cubic_coeff_a) ARG(INT,exclude_outside) ARG(FLOAT,extrapolation_value) ARG(STRING,mode) ARG(STRING,nearest_mode) ) // NOLINT +OP_DECL(popart_round, aiOnnxOpset.round, NONE) // NOLINT +OP_DECL(popart_scatter, aiOnnxOpset.scatter, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_scatterelements, aiOnnxOpset.scatterelements, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_scatternd, aiOnnxOpset.scatternd, NONE) // NOLINT +OP_DECL(popart_sequenceat, aiOnnxOpset.sequenceat, NONE) // NOLINT +OP_DECL(popart_sequenceconstruct, aiOnnxOpset.sequenceconstruct, NONE) // NOLINT +OP_DECL(popart_sequenceerase, aiOnnxOpset.sequenceerase, NONE) // NOLINT +OP_DECL(popart_sequenceinsert, aiOnnxOpset.sequenceinsert, NONE) // NOLINT +OP_DECL(popart_sequencelength, aiOnnxOpset.sequencelength, NONE) // NOLINT +OP_DECL(popart_slice, aiOnnxOpset.slice, NONE) // NOLINT +OP_DECL(popart_softmax, aiOnnxOpset.softmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_split, aiOnnxOpset.split, ARG(INT,num_outputs) ARG(INT,axis) ARG(INT_VEC,split) ) // NOLINT +OP_DECL(popart_splittosequence, aiOnnxOpset.splittosequence, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_squeeze, aiOnnxOpset.squeeze, ARG(INT_VEC,axes) ) // NOLINT +OP_DECL(popart_topk, aiOnnxOpset.topk, ARG(INT,axis) ARG(INT,largest) ARG(INT,sorted) ) // NOLINT +OP_DECL(popart_unique, aiOnnxOpset.unique, ARG(INT,num_outputs) OPT_ARG(INT,axis) ARG(INT,sorted) ) // NOLINT +OP_DECL(popart_unsqueeze, aiOnnxOpset.unsqueeze, ARG(INT_VEC,axes) ) // NOLINT +// Ops from AiOnnxOpset10 +OP_DECL(popart_convinteger, aiOnnxOpset.convinteger, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_dequantizelinear, aiOnnxOpset.dequantizelinear, NONE) // NOLINT +OP_DECL(popart_dropout, aiOnnxOpset.dropout, ARG(INT,num_outputs) ARG(FLOAT,ratio) ) // NOLINT +OP_DECL(popart_isinf, aiOnnxOpset.isinf, ARG(INT,detect_negative) ARG(INT,detect_positive) ) // NOLINT +OP_DECL(popart_matmulinteger, aiOnnxOpset.matmulinteger, NONE) // NOLINT +OP_DECL(popart_mod, aiOnnxOpset.mod, ARG(INT,fmod) ) // NOLINT +OP_DECL(popart_qlinearconv, aiOnnxOpset.qlinearconv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_qlinearmatmul, aiOnnxOpset.qlinearmatmul, NONE) // NOLINT +OP_DECL(popart_quantizelinear, aiOnnxOpset.quantizelinear, NONE) // NOLINT +OP_DECL(popart_reversesequence, aiOnnxOpset.reversesequence, ARG(INT,batch_axis) ARG(INT,time_axis) ) // NOLINT +OP_DECL(popart_roialign, aiOnnxOpset.roialign, ARG(STRING,mode) ARG(INT,output_height) ARG(INT,output_width) ARG(INT,sampling_ratio) ARG(FLOAT,spatial_scale) ) // NOLINT +OP_DECL(popart_thresholdedrelu, aiOnnxOpset.thresholdedrelu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_upsample, aiOnnxOpset.upsample, ARG(STRING,mode) ) // NOLINT 
+// Ops from AiOnnxOpset9 +OP_DECL(popart_acosh, aiOnnxOpset.acosh, NONE) // NOLINT +OP_DECL(popart_asinh, aiOnnxOpset.asinh, NONE) // NOLINT +OP_DECL(popart_atanh, aiOnnxOpset.atanh, NONE) // NOLINT +OP_DECL(popart_batchnormalization, aiOnnxOpset.batchnormalization, ARG(INT,num_outputs) ARG(FLOAT,epsilon) ARG(FLOAT,momentum) ) // NOLINT +OP_DECL(popart_cast, aiOnnxOpset.cast, ARG(STRING,to) ) // NOLINT +OP_DECL(popart_cosh, aiOnnxOpset.cosh, NONE) // NOLINT +OP_DECL(popart_erf, aiOnnxOpset.erf, NONE) // NOLINT +OP_DECL(popart_eyelike, aiOnnxOpset.eyelike, OPT_ARG(INT,dtype) ARG(INT,k) ) // NOLINT +OP_DECL(popart_greater, aiOnnxOpset.greater, NONE) // NOLINT +OP_DECL(popart_isnan, aiOnnxOpset.isnan, NONE) // NOLINT +OP_DECL(popart_less, aiOnnxOpset.less, NONE) // NOLINT +OP_DECL(popart_matmul, aiOnnxOpset.matmul, NONE) // NOLINT +OP_DECL(popart_meanvariancenormalization, aiOnnxOpset.meanvariancenormalization, ARG(INT_VEC,axes) ) // NOLINT +OP_DECL(popart_nonzero, aiOnnxOpset.nonzero, NONE) // NOLINT +OP_DECL(popart_prelu, aiOnnxOpset.prelu, NONE) // NOLINT +OP_DECL(popart_shrink, aiOnnxOpset.shrink, ARG(FLOAT,bias) ARG(FLOAT,lambd) ) // NOLINT +OP_DECL(popart_sign, aiOnnxOpset.sign, NONE) // NOLINT +OP_DECL(popart_sinh, aiOnnxOpset.sinh, NONE) // NOLINT +OP_DECL(popart_where, aiOnnxOpset.where, NONE) // NOLINT +// Ops from AiOnnxOpset8 +OP_DECL(popart_expand, aiOnnxOpset.expand, NONE) // NOLINT +OP_DECL(popart_max, aiOnnxOpset.max, NONE) // NOLINT +OP_DECL(popart_mean, aiOnnxOpset.mean, NONE) // NOLINT +OP_DECL(popart_min, aiOnnxOpset.min, NONE) // NOLINT +OP_DECL(popart_sum, aiOnnxOpset.sum, NONE) // NOLINT +// Ops from AiOnnxOpset7 +OP_DECL(popart_acos, aiOnnxOpset.acos, NONE) // NOLINT +OP_DECL(popart_add, aiOnnxOpset.add, NONE) // NOLINT +OP_DECL(popart_logical_and, aiOnnxOpset.logical_and, NONE) // NOLINT +OP_DECL(popart_asin, aiOnnxOpset.asin, NONE) // NOLINT +OP_DECL(popart_atan, aiOnnxOpset.atan, NONE) // NOLINT +OP_DECL(popart_cos, aiOnnxOpset.cos, NONE) // NOLINT +OP_DECL(popart_div, aiOnnxOpset.div, NONE) // NOLINT +OP_DECL(popart_mul, aiOnnxOpset.mul, NONE) // NOLINT +OP_DECL(popart_multinomial, aiOnnxOpset.multinomial, ARG(INT,dtype) ARG(INT,sample_size) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_logical_or, aiOnnxOpset.logical_or, NONE) // NOLINT +OP_DECL(popart_pow, aiOnnxOpset.pow, NONE) // NOLINT +OP_DECL(popart_sin, aiOnnxOpset.sin, NONE) // NOLINT +OP_DECL(popart_sub, aiOnnxOpset.sub, NONE) // NOLINT +OP_DECL(popart_tan, aiOnnxOpset.tan, NONE) // NOLINT +OP_DECL(popart_logical_xor, aiOnnxOpset.logical_xor, NONE) // NOLINT +// Ops from AiOnnxOpset6 +OP_DECL(popart_abs, aiOnnxOpset.abs, NONE) // NOLINT +OP_DECL(popart_ceil, aiOnnxOpset.ceil, NONE) // NOLINT +OP_DECL(popart_elu, aiOnnxOpset.elu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_exp, aiOnnxOpset.exp, NONE) // NOLINT +OP_DECL(popart_floor, aiOnnxOpset.floor, NONE) // NOLINT +OP_DECL(popart_globalaveragepool, aiOnnxOpset.globalaveragepool, NONE) // NOLINT +OP_DECL(popart_globallppool, aiOnnxOpset.globallppool, ARG(INT,p) ) // NOLINT +OP_DECL(popart_globalmaxpool, aiOnnxOpset.globalmaxpool, NONE) // NOLINT +OP_DECL(popart_hardsigmoid, aiOnnxOpset.hardsigmoid, ARG(FLOAT,alpha) ARG(FLOAT,beta) ) // NOLINT +OP_DECL(popart_identity, aiOnnxOpset.identity, NONE) // NOLINT +OP_DECL(popart_instancenormalization, aiOnnxOpset.instancenormalization, ARG(FLOAT,epsilon) ) // NOLINT +OP_DECL(popart_lrn, aiOnnxOpset.lrn, ARG(INT,size) ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(FLOAT,bias) ) // NOLINT +OP_DECL(popart_leakyrelu, 
aiOnnxOpset.leakyrelu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_log, aiOnnxOpset.log, NONE) // NOLINT +OP_DECL(popart_lpnormalization, aiOnnxOpset.lpnormalization, ARG(INT,axis) ARG(INT,p) ) // NOLINT +OP_DECL(popart_maxroipool, aiOnnxOpset.maxroipool, ARG(INT_VEC,pooled_shape) ARG(FLOAT,spatial_scale) ) // NOLINT +OP_DECL(popart_neg, aiOnnxOpset.neg, NONE) // NOLINT +OP_DECL(popart_logical_not, aiOnnxOpset.logical_not, NONE) // NOLINT +OP_DECL(popart_randomnormallike, aiOnnxOpset.randomnormallike, OPT_ARG(INT,dtype) ARG(FLOAT,mean) ARG(FLOAT,scale) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_randomuniformlike, aiOnnxOpset.randomuniformlike, OPT_ARG(INT,dtype) ARG(FLOAT,high) ARG(FLOAT,low) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_reciprocal, aiOnnxOpset.reciprocal, NONE) // NOLINT +OP_DECL(popart_relu, aiOnnxOpset.relu, NONE) // NOLINT +OP_DECL(popart_reshape, aiOnnxOpset.reshape, NONE) // NOLINT +OP_DECL(popart_selu, aiOnnxOpset.selu, ARG(FLOAT,alpha) ARG(FLOAT,gamma) ) // NOLINT +OP_DECL(popart_shape, aiOnnxOpset.shape, NONE) // NOLINT +OP_DECL(popart_sigmoid, aiOnnxOpset.sigmoid, NONE) // NOLINT +OP_DECL(popart_size, aiOnnxOpset.size, NONE) // NOLINT +OP_DECL(popart_softplus, aiOnnxOpset.softplus, NONE) // NOLINT +OP_DECL(popart_softsign, aiOnnxOpset.softsign, NONE) // NOLINT +OP_DECL(popart_spacetodepth, aiOnnxOpset.spacetodepth, ARG(INT,blocksize) ) // NOLINT +OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT +OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT +OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT +OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index f1ef8650be4c1..69cea31446680 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -66,11 +66,11 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { // inline HCCLGroupGuard() { // HCCLMutex().lock(); -// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); +// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); // } // inline ~HCCLGroupGuard() PADDLE_MAY_THROW { -// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); +// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); // HCCLMutex().unlock(); // } // }; diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index 1cc7bba132e59..d6b466ff92c5b 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -29,40 +29,35 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl1_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap s_xpu1_kernels{ - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"affine_channel", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"affine_channel_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"affine_channel", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, @@ -72,188 +67,197 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"c_reduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_allreduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicaland", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - 
pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"deformable_conv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"deformable_conv_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub", + {"c_allreduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub_grad", + {"c_reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_add_grad", + {"elementwise_div_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_div", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", + {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_pow", + {"elementwise_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_floordiv", + {"elementwise_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_mul", + {"elementwise_min_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max", + {"elementwise_mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max_grad", + {"elementwise_pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min", + {"elementwise_sub_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min_grad", + {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"expand_as_v2", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"fill_constant", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gather", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gaussian_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2_grad", + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_switch_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2", + {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2_grad", + {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", + {"leaky_relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"load", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicaland", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, {"matmul_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, - {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"range", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", + {"reduce_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max_grad", + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, 
XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"shape", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze_grad", + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -268,7 +272,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2_grad", + {"squeeze_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -276,27 +280,29 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + 
pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose_grad", + {"transpose2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose2_grad", + {"transpose_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"truncated_gaussian_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"uniform_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze_grad", + {"unsqueeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -311,7 +317,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2_grad", + {"unsqueeze_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -319,21 +325,13 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 78fc53cfc8535..74f519c7a8617 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -29,141 +29,109 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl2_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap 
s_xpu2_kernels{ - {"label_smooth", + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign_value", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_add", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + {"batch_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_add_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_div", + {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"elementwise_div_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_div_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_pow", + {"elementwise_div", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul_grad", + {"elementwise_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_max_grad", + {"elementwise_min_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_min_grad", + {"elementwise_mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), 
pOpKernelType(vartype::FP16, XPUPlace())})}, - {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"layer_norm_grad", + {"elementwise_mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy_grad", + {"elementwise_pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose_grad", + {"elementwise_sub_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2_grad", + {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_than", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), + 
{"expand_as_v2", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_equal", + {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"fill_any_like", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"fill_any_like", + {"fill_constant", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_grad", + pOpKernelType(vartype::BF16, XPUPlace()), + pOpKernelType(vartype::COMPLEX64, XPUPlace()), + pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, + {"flatten2_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), @@ -172,123 +140,205 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten2_grad", + {"flatten_contiguous_range_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign_value", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range_grad", + {"flatten_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"gaussian_random", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fill_constant", + {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::BF16, XPUPlace()), - pOpKernelType(vartype::COMPLEX64, 
XPUPlace()), - pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softmax_grad", + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"greater_than", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"label_smooth", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"masked_select", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), + {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + {"one_hot_v2", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"reduce_max_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", + {"softmax_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"squeeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + 
{"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"unsqueeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c2dc60a29fe42..a0c9ff09460af 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -370,10 +370,10 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; semaphore_ = reinterpret_cast(scratch); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); #endif } @@ -439,14 +439,14 @@ CUDAContext::~CUDAContext() { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceGuard guard(place_.device); - compute_capability_ = GetCUDAComputeCapability(place_.device); - multi_process_ = 
GetCUDAMultiProcessors(place_.device); - max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + compute_capability_ = GetGPUComputeCapability(place_.device); + multi_process_ = GetGPUMultiProcessors(place_.device); + max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); - max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device); + max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device); - driver_version_ = GetCUDADriverVersion(place_.device); - runtime_version_ = GetCUDARuntimeVersion(place_.device); + driver_version_ = GetGPUDriverVersion(place_.device); + runtime_version_ = GetGPURuntimeVersion(place_.device); LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", GPU Compute Capability: " @@ -459,7 +459,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { << (runtime_version_ % 100) / 10; #ifdef PADDLE_WITH_HIP size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( &version_major, &version_minor, &version_patch)); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", MIOpen Version: " << version_major << "." @@ -499,7 +499,7 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); } #endif } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 73232994516b6..875132dfe89c4 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" @@ -28,17 +28,17 @@ limitations under the License. */ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/cuda_helper.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/dynload/rccl.h" #endif -#include "paddle/fluid/platform/gpu_info.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -62,6 +62,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/device.h" +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -99,8 +102,8 @@ enum DeviceType { CUDA = 1, XPU = 2, NPU = 3, - - MAX_DEVICE_TYPES = 4, + IPU = 4, + MAX_DEVICE_TYPES = 5, }; DeviceType Place2DeviceType(const platform::Place& place); @@ -109,6 +112,7 @@ constexpr DeviceType kCPU = DeviceType::CPU; constexpr DeviceType kCUDA = DeviceType::CUDA; constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kNPU = DeviceType::NPU; +constexpr DeviceType kIPU = DeviceType::IPU; class DeviceContext { public: @@ -140,6 +144,30 @@ struct DefaultDeviceContextType { using TYPE = CPUDeviceContext; }; +// Graphcore IPU +#ifdef PADDLE_WITH_IPU +class IPUDeviceContext : public DeviceContext { + public: + IPUDeviceContext() = delete; + explicit IPUDeviceContext(IPUPlace place); + virtual ~IPUDeviceContext(); + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + Place GetPlace() const override; + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + int DeviceId() const { return device_.getId(); } + + private: + IPUPlace place_; + platform::ipu::Device device_; +}; +template <> +struct DefaultDeviceContextType { + using TYPE = IPUDeviceContext; +}; + +#endif + #ifdef PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; class XPUDeviceContext : public DeviceContext { @@ -371,7 +399,7 @@ class CUDAContext { if (dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( &miopen_major, &miopen_minor, &miopen_patch)); auto local_miopen_version = (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; @@ -388,8 +416,8 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenSetStream(cudnn_handle_, RawStream())); #else auto local_cudnn_version = dynload::cudnnGetVersion() / 100; @@ -425,9 +453,9 @@ class CUDAContext { void DestoryCuDNNContext() { if (cudnn_handle_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); #endif } cudnn_handle_ = nullptr; @@ -442,7 +470,7 @@ class CUDAContext { #ifndef PADDLE_WITH_HIP void DestoryCuSolverContext() { if (cusolver_dn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnDestroy(cusolver_dn_handle_)); } } diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 2f9413c4f3ea7..cf617a478eb71 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -23,7 +23,7 @@ TEST(Device, Init) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) 
{ CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); @@ -36,7 +36,7 @@ TEST(Device, CUDADeviceContext) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); @@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) { ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; ++i) { auto dev_ctx = pool.Get(CUDAPlace(i)); ASSERT_NE(dev_ctx, nullptr); diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index f42eb7ece1a72..a3f88592b7649 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -17,12 +17,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" -#endif #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/device/npu/npu_info.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index f72eb6731f627..34845f24ff50d 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -25,6 +25,12 @@ limitations under the License. */ (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ MIOPEN_VERSION_PATCH) // NOLINT +// MIOPEN only support NCHW, just for compatibility with CUDNN API +typedef enum { + MIOPEN_TENSOR_NCHW = 0, + MIOPEN_TENSOR_NHWC = 1, +} miopenTensorFormat_t; + namespace paddle { namespace platform { namespace dynload { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 86f71fdf64fba..530ae6ba79889 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -96,7 +96,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/type_defs.h" // Note: this header for simplify HIP and CUDA type string #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/fluid/platform/flags.h" @@ -944,7 +944,7 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { } #endif // not(__APPLE__) and PADDLE_WITH_NCCL -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ @@ -1150,7 +1150,7 @@ DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); } // namespace details -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 6ff9e6ea903cd..b9e4239299169 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -294,14 +294,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_GPU_SUCCESS(value); return true; } template bool CheckCudaStatusFailure(T value, const std::string& msg) { try { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_GPU_SUCCESS(value); return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 2b11de48a1ec7..136dc2d725208 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -148,9 +148,9 @@ class CudaEvent { void Record(const paddle::platform::stream::CUDAStream& stream) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); #endif } @@ -172,15 +172,15 @@ class CudaEvent { return false; } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); return false; } void Synchronize() { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventSynchronize(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventSynchronize(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif } gpuEvent_t GetRawCudaEvent() { return event_; } diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 6e5c7f4e91660..5518dabbf92a4 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc deleted file mode 100644 index 9dc6254234a97..0000000000000 --- a/paddle/fluid/platform/gpu_info.cc +++ /dev/null @@ -1,734 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/gpu_info.h" -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/platform/cuda_device_guard.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/miopen.h" -#else -#include "paddle/fluid/platform/cuda_graph.h" -#include "paddle/fluid/platform/dynload/cudnn.h" -#endif -#include "paddle/fluid/memory/malloc.h" -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 -#include "paddle/fluid/platform/dynload/cuda_driver.h" -#endif -#endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_string(selected_gpus); -DECLARE_uint64(gpu_memory_limit_mb); - -constexpr static float fraction_reserve_gpu_memory = 0.05f; - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -USE_GPU_MEM_STAT; -namespace paddle { -namespace platform { - -int CudnnVersion() { - if (!dynload::HasCUDNN()) return -1; - -#ifdef PADDLE_WITH_HIP - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -#else - return dynload::cudnnGetVersion(); -#endif -} -static int GetCUDADeviceCountImpl() { - int driverVersion = 0; -#ifdef PADDLE_WITH_HIP - hipError_t status = hipDriverGetVersion(&driverVersion); -#else - cudaError_t status = cudaDriverGetVersion(&driverVersion); -#endif - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; - return 0; - } - -#ifdef PADDLE_WITH_HIP - const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES"); -#else - const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); -#endif - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be " - "empty. No GPU detected."; - return 0; - } - } - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count)); -#endif - return count; -} - -int GetCUDADeviceCount() { - // cache the count - static auto dev_cnt = GetCUDADeviceCountImpl(); - return dev_cnt; -} - -/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much -faster way to query device properties. You can see details in -https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ -*/ -int GetCUDAComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int major, minor; - -#ifdef PADDLE_WITH_HIP - auto major_error_code = hipDeviceGetAttribute( - &major, hipDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = hipDeviceGetAttribute( - &minor, hipDeviceAttributeComputeCapabilityMinor, id); -#else - auto major_error_code = - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); - auto minor_error_code = - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code); - PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code); -#ifdef PADDLE_WITH_HIP - return major * 100 + minor; -#else - return major * 10 + minor; -#endif -} - -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - dim3 ret; - int size; -#ifdef PADDLE_WITH_HIP - auto error_code_x = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); -#else - auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x); - ret.x = size; - -#ifdef PADDLE_WITH_HIP - auto error_code_y = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); -#else - auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y); - ret.y = size; - -#ifdef PADDLE_WITH_HIP - auto error_code_z = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); -#else - auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z); - ret.z = size; - return ret; -} - -int GetCUDARuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int runtime_version = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); -#endif - return runtime_version; -} - -int GetCUDADriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int driver_version = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version)); -#endif - return driver_version; -} - -bool TensorCoreAvailable() { -#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000 - int device = GetCurrentDeviceId(); - int driver_version = GetCUDAComputeCapability(device); - return driver_version >= 70; -#else - return false; -#endif -} - -int GetCUDAMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); -#endif - return count; -} - -int GetCUDAMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute( - &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); -#endif - return count; -} - -int GetCUDAMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); -#endif - return count; -} - -int GetCurrentDeviceId() { - int device_id; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); -#endif - return device_id; -} - -//! Get a list of device ids from environment variable or use all. -std::vector GetSelectedDevices() { - // use user specified GPUs in single-node multi-process mode. - std::vector devices; - if (!FLAGS_selected_gpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = platform::GetCUDADeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = platform::GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaGetDeviceProperties(&g_device_props[id], id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipGetDeviceProperties(&g_device_props[id], id)); -#endif - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); -#ifdef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); -#else - PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); -#endif -} - -void GpuMemoryUsage(size_t *available, size_t *total) { - size_t actual_available, actual_total; - RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total, - platform::GetCurrentDeviceId()); -} - -size_t GpuAvailableMemToAlloc() { - size_t total = 0; - size_t available = 0; - GpuMemoryUsage(&available, &total); - size_t reserving = - static_cast(fraction_reserve_gpu_memory * available); - // If available size is less than minimum chunk size, no usable memory exists - size_t available_to_alloc = available - reserving; - size_t min_chunk_size = GpuMinChunkSize(); - if (available_to_alloc < min_chunk_size) { - available_to_alloc = 0; - } - VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20) - << "M, " << (available_to_alloc >> 20) << "M available to allocate"; - return available_to_alloc; -} - -size_t GpuMaxAllocSize() { - return std::max(GpuInitAllocSize(), GpuReallocSize()); -} - -static size_t GpuAllocSize(bool realloc) { - size_t available_to_alloc = GpuAvailableMemToAlloc(); - PADDLE_ENFORCE_GT( - available_to_alloc, 0, - platform::errors::ResourceExhausted("Not enough available GPU memory.")); - // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be - // allocated by fraction - size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb - : FLAGS_initial_gpu_memory_in_mb; - size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); - PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, - platform::errors::ResourceExhausted("Not enough available GPU memory.")); - VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) - << " MiB, is it Re-alloc: " << realloc; - return alloc_bytes; -} - -size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); } - -size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); } - -size_t GpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. 
- return 1 << 8; -} - -size_t GpuMaxChunkSize() { - size_t max_chunk_size = GpuMaxAllocSize(); - VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; - return max_chunk_size; -} - -#ifdef PADDLE_WITH_HIP -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - enum hipMemcpyKind kind, hipStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); -} -#else -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); -} -#endif - -#ifdef PADDLE_WITH_HIP -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum hipMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind)); -} -#else -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); -} -#endif - -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -#endif -} - -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpyPeer(dst, dst_device, src, src_device, count)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeer(dst, dst_device, src, src_device, count)); -#endif -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); -#endif -} - -void GpuStreamSync(gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -} - -static void RaiseNonOutOfMemoryError(gpuError_t *status) { -#ifdef PADDLE_WITH_HIP - if (*status == hipErrorOutOfMemory) { - *status = hipSuccess; - } -#else - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(*status); - -#ifdef PADDLE_WITH_HIP - *status = hipGetLastError(); - if (*status == hipErrorOutOfMemory) { - *status = hipSuccess; - } -#else - *status = cudaGetLastError(); - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(*status); -} - -class RecordedCudaMallocHelper { - private: - explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0) - : dev_id_(dev_id), limit_size_(limit_size) { - if (NeedRecord()) { - mtx_.reset(new std::mutex()); - } - } - - DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper); - - public: - static RecordedCudaMallocHelper *Instance(int dev_id) { - std::call_once(once_flag_, [] { - int dev_cnt = GetCUDADeviceCount(); - instances_.reserve(dev_cnt); - for (int i = 0; i < dev_cnt; ++i) { - instances_.emplace_back( - new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); - } - }); - - PADDLE_ENFORCE_GE( - dev_id, 0, - platform::errors::OutOfRange( - "Device id must be not less than 0, but got %d.", dev_id)); - PADDLE_ENFORCE_LT( - dev_id, instances_.size(), - 
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); - return instances_[dev_id].get(); - } - - /** - * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation - * or cudaSuccess would be returned, and the cudaGetLastError() flag - * would be clear. - */ - gpuError_t Malloc(void **ptr, size_t size) { - LockGuardPtr lock(mtx_); - if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { -#ifdef PADDLE_WITH_HIP - return hipErrorOutOfMemory; -#else - return cudaErrorMemoryAllocation; -#endif - } - - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto result = hipMalloc(ptr, size); -#else - CUDAGraphCaptureModeGuard capture_mode_guard; - auto result = cudaMalloc(ptr, size); -#endif - if (result == gpuSuccess) { - cur_size_.fetch_add(size); - STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - return gpuSuccess; - } else { - RaiseNonOutOfMemoryError(&result); -// Non out of memory error would be raised inside -// RaiseNonOutOfMemoryError. Therefore, we can -// return cudaErrorMemoryAllocation directly here. -#ifdef PADDLE_WITH_HIP - return hipErrorOutOfMemory; -#else - return cudaErrorMemoryAllocation; -#endif - } - } - - /** - * Free gpu memory. Usually, free is not allowed to raise error. - * If it does raise error, the process should be crashed. - */ - void Free(void *ptr, size_t size) { - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the - // driver has already shutdown. This happens only if the - // process is terminating, in which case we don't care if - // cudaFree succeeds. - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto err = hipFree(ptr); - if (err != hipErrorDeinitialized) { -#else - auto err = cudaFree(ptr); - if (err != cudaErrorCudartUnloading) { -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); - cur_size_.fetch_sub(size); - STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - } else { -#ifdef PADDLE_WITH_HIP - hipGetLastError(); // clear the error flag when hipErrorDeinitialized -#else - cudaGetLastError(); // clear the error flag when cudaErrorCudartUnloading -#endif - } - } - - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total) { - { - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto result = hipMemGetInfo(actual_avail, actual_total); -#else - auto result = cudaMemGetInfo(actual_avail, actual_total); -#endif - if (result != gpuSuccess) { - *actual_avail = 0; - } - RaiseNonOutOfMemoryError(&result); - } - - if (NeedRecord()) { - std::lock_guard guard(*mtx_); - *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); - *total = std::min(*actual_total, limit_size_); - return *total < *actual_total; - } else { - *avail = *actual_avail; - *total = *actual_total; - return false; - } - } - - inline bool NeedRecord() const { return limit_size_ != 0; } - - uint64_t RecordedSize() const { return cur_size_.load(); } - - uint64_t LimitSize() const { return limit_size_; } - -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags) { // NOLINT - auto result = - paddle::platform::dynload::cuMemCreate(handle, size, prop, flags); - if (result == CUDA_SUCCESS) { - cur_size_.fetch_add(size); - } - return result; - } - - CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) { - auto result = 
paddle::platform::dynload::cuMemRelease(handle); - if (result == CUDA_SUCCESS) { - cur_size_.fetch_sub(size); - } - return result; - } - -#endif -#endif - - private: - const int dev_id_; - const uint64_t limit_size_; - std::atomic cur_size_{0}; - - mutable std::unique_ptr mtx_; - - static std::once_flag once_flag_; - static std::vector> instances_; -}; // NOLINT - -std::once_flag RecordedCudaMallocHelper::once_flag_; -std::vector> - RecordedCudaMallocHelper::instances_; - -gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size); -} - -void RecordedCudaFree(void *p, size_t size, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size); -} - -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 -CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedCudaMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); -} - -CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size, - int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->MemRelease(handle, size); -} -#endif -#endif - -bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo( - avail, total, actual_avail, actual_total); -} - -uint64_t RecordedCudaMallocSize(int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize(); -} - -bool IsCudaMallocRecorded(int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord(); -} - -void EmptyCache(void) { - std::vector devices = GetSelectedDevices(); - for (auto device : devices) { - memory::Release(CUDAPlace(device)); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 698563a53d255..b642f160da21a 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -45,6 +45,10 @@ limitations under the License. */ #include "DbgHelp.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, 1, @@ -164,6 +168,15 @@ void InitDevices() { LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_IPU + try { + // use user specified IPUs. 
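The IPU branch that continues just below mirrors the NPU branch above it: ask the platform layer for the user-selected device ids, and log a warning if none are found at runtime. Selection of this kind ultimately reduces to parsing a comma-separated id list (the removed GetSelectedDevices earlier in this patch split FLAGS_selected_gpus on ','); a self-contained sketch with hypothetical names:

    #include <sstream>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for GetSelectedDevices()-style parsing:
    // "0,2,3" -> {0, 2, 3}; an empty flag means "use every visible device",
    // where device_count is supplied by the caller.
    std::vector<int> ParseSelectedDevices(const std::string& flag, int device_count) {
      std::vector<int> devices;
      if (flag.empty()) {
        for (int i = 0; i < device_count; ++i) devices.push_back(i);
        return devices;
      }
      std::stringstream ss(flag);
      std::string token;
      while (std::getline(ss, token, ',')) {
        if (!token.empty()) devices.push_back(std::stoi(token));
      }
      return devices;
    }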
+ devices = platform::GetSelectedIPUDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) + << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime."; + } #endif InitDevices(devices); } @@ -185,6 +198,9 @@ void InitDevices(const std::vector devices) { #ifdef PADDLE_WITH_XPU places.emplace_back(platform::XPUPlace(devices[i])); #endif +#ifdef PADDLE_WITH_IPU + places.emplace_back(platform::IPUPlace(devices[i])); +#endif #ifdef PADDLE_WITH_ASCEND_CL places.emplace_back(platform::NPUPlace(devices[i])); #endif diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 965fe7b6db45c..dbca7d1549546 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,7 @@ TEST(InitDevices, CUDA) { using paddle::platform::DeviceContextPool; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 2U + static_cast(count)); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 40d9bb99f44f5..f6d9c8f64fd35 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include // NOLINT #include +#include #include +#include #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" @@ -30,6 +32,290 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, namespace paddle { namespace platform { +struct DurationEvent { + public: + DurationEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} + + DurationEvent(std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + DurationEvent(const std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + const char *attr = nullptr; // not owned, designed for performance +}; + +template +struct ContainsStdString + : std::conditional_t< + std::is_same>>::value, + std::true_type, ContainsStdString> {}; + +template +struct ContainsStdString + : std::is_same>> {}; + +template +class EventContainer { + public: + EventContainer() { + event_blocks_ = cur_event_block_ = new EventBlock; + str_blocks_ = cur_str_block_ = new StringBlock; + } + ~EventContainer() { + Reduce(); + delete event_blocks_; + for (auto cur = str_blocks_; cur != nullptr;) { + auto next = cur->next; + delete cur; + cur = next; + } + } + DISABLE_COPY_AND_ASSIGN(EventContainer); + + public: + // Record an event + template + void Record(Args &&... 
args) { + DoRecord(ContainsStdString(), std::forward(args)...); + } + + // Get all events and clear the container + std::vector Reduce(); + + // Return a buffer to store the string attribute of Event. + // HostEventRecorder locates in the static data section. + // So it's safe to use arena to avoid fragmented allocations. + char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } + + private: + struct EventBlock { + union InitDeferedEvent { + InitDeferedEvent() {} + ~InitDeferedEvent() {} + + EventType event; + }; + + static constexpr size_t kBlockSize = 1 << 24; // 16 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); + static constexpr size_t kPadSize = + kAvailSize - kNumEvents * sizeof(InitDeferedEvent); + static constexpr size_t kMinimumEventsPerBlock = 1024; + static_assert( + kNumEvents >= kMinimumEventsPerBlock, + "EventType is too large for kBlockSize, make kBlockSize larger"); + + size_t offset = 0; + EventBlock *next = nullptr; + InitDeferedEvent events[kNumEvents]; + char padding[kPadSize]; + }; + static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, + "sizeof EventBlock must equal to kBlockSize"); + + struct StringBlock { + static constexpr size_t kBlockSize = 1 << 22; // 4 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + + size_t offset = 0; + StringBlock *next = nullptr; + char storage[kAvailSize]; + }; + static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, + "sizeof StringBlock must equal to kBlockSize"); + + // Record an event with string arguments + template + void DoRecord(std::true_type, Args &&... args) { + auto *storage = GetEventStorage(); + std::function allocator = [this](size_t size) { + return GetStrBufFromArena(size); + }; + new (storage) EventType(allocator, std::forward(args)...); + } + + // Record an event without any string argument + template + void DoRecord(std::false_type, Args &&... 
args) { + auto *storage = GetEventStorage(); + new (storage) EventType(std::forward(args)...); + } + + EventType *GetEventStorage(); + + char *GetStringStorage(size_t sz); + + EventBlock *event_blocks_ = nullptr; + EventBlock *cur_event_block_ = nullptr; + StringBlock *str_blocks_ = nullptr; + StringBlock *cur_str_block_ = nullptr; +}; + +template +std::vector EventContainer::Reduce() { + std::vector all_events; + size_t event_cnt = 0; + for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { + event_cnt += cur->offset; + } + all_events.reserve(event_cnt); + for (auto cur = event_blocks_; cur != nullptr;) { + for (size_t i = 0; i < cur->offset; ++i) { + all_events.emplace_back(cur->events[i].event); + } + auto next = cur->next; + delete cur; + cur = next; + } + event_blocks_ = cur_event_block_ = new EventBlock; + return std::move(all_events); +} + +template +EventType *EventContainer::GetEventStorage() { + if (UNLIKELY(cur_event_block_->offset >= + EventBlock::kNumEvents)) { // another block + cur_event_block_->next = new EventBlock; + cur_event_block_ = cur_event_block_->next; + } + auto &obj = cur_event_block_->events[cur_event_block_->offset].event; + ++cur_event_block_->offset; + return &obj; +} + +template +char *EventContainer::GetStringStorage(size_t sz) { + if (UNLIKELY(cur_str_block_->offset + sz > + StringBlock::kAvailSize)) { // another block + cur_str_block_->next = new StringBlock; + cur_str_block_ = cur_str_block_->next; + } + char *storage = cur_str_block_->storage + cur_str_block_->offset; + cur_str_block_->offset += sz; + return storage; +} + +struct ThreadEventSection { + std::string thread_name; + uint64_t thread_id; + std::vector events; +}; + +class ThreadEventRecorder { + public: + ThreadEventRecorder(); + DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); + + public: + // Forward call to EventContainer::Record + template + void RecordEvent(Args &&... args) { + base_evt_cntr_.Record(std::forward(args)...); + } + + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; + thr_sec.thread_name = thread_name_; + thr_sec.thread_id = thread_id_; + thr_sec.events = std::move(base_evt_cntr_.Reduce()); + return std::move(thr_sec); + } + + private: + uint64_t thread_id_; + std::string thread_name_; + EventContainer base_evt_cntr_; +}; + +struct HostEventSection { + std::string process_name; + uint64_t process_id; + std::vector thr_sections; +}; + +class HostEventRecorder { + public: + // singleton + static HostEventRecorder &GetInstance() { + static HostEventRecorder instance; + return instance; + } + + // If your string argument has a longer lifetime than the Event, + // use 'const char*'. e.g.: string literal, op name, etc. + // Do your best to avoid using 'std::string' as the argument type. + // It will cause deep-copy to harm performance. + template + void RecordEvent(Args &&... 
args) { + GetThreadLocalRecorder().RecordEvent(std::forward(args)...); + } + + // Poor performance, call it at the ending + HostEventSection GatherEvents(); + + void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { + const std::lock_guard guard(thread_recorders_lock_); + thread_recorders_[tid] = recorder; + } + + private: + HostEventRecorder() = default; + DISABLE_COPY_AND_ASSIGN(HostEventRecorder); + + ThreadEventRecorder &GetThreadLocalRecorder() { + static thread_local ThreadEventRecorder tls_recorder; + return tls_recorder; + } + + std::mutex thread_recorders_lock_; + std::unordered_map thread_recorders_; +}; + +static uint64_t GetThreadId() { + return std::hash{}(std::this_thread::get_id()); +} + +ThreadEventRecorder::ThreadEventRecorder() { + thread_id_ = GetThreadId(); + HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); +} + +HostEventSection HostEventRecorder::GatherEvents() { + HostEventSection host_sec; + host_sec.thr_sections.reserve(thread_recorders_.size()); + for (auto &kv : thread_recorders_) { + host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); + } + return std::move(host_sec); +} + MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, @@ -57,8 +343,44 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } +RecordEvent::RecordEvent(const char *name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name); + is_pushed_ = true; + } +#endif +#endif + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + RecordEvent(name, role, "none"); + return; + } + shallow_copy_name_ = name; + role_ = role; + start_ns_ = PosixInNsec(); +} + +RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name.c_str()); + is_pushed_ = true; + } +#endif +#endif + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + RecordEvent(name, role, "none"); + return; + } + name_ = new std::string(name); + role_ = role; + start_ns_ = PosixInNsec(); +} + RecordEvent::RecordEvent(const std::string &name, const EventRole role, - const std::string attr) { + const std::string &attr) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -67,17 +389,26 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, } #endif #endif + if (g_enable_host_event_recorder_hook) { + name_ = new std::string(name); + start_ns_ = PosixInNsec(); + attr_ = new std::string(attr); + return; + } + if (g_state == ProfilerState::kDisabled || name.empty()) return; // do some initialization + name_ = new std::string(name); start_ns_ = PosixInNsec(); role_ = role; + attr_ = new std::string(attr); is_enabled_ = true; // lock is not needed, the code below is thread-safe // Maybe need the same push/pop behavior. 
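The host-event machinery added above is designed so the per-event hot path never takes a lock: each thread lazily constructs a thread_local ThreadEventRecorder, which registers itself once with the HostEventRecorder singleton under a mutex; after that, RecordEvent only touches the thread's own arena-backed EventContainer, and the mutex is revisited only by the slow GatherEvents path. A distilled, self-contained sketch of that pattern (simplified names, not the real classes):

    #include <mutex>
    #include <thread>
    #include <unordered_map>
    #include <vector>

    struct Registry;

    // Per-thread recorder: the hot path appends without any locking.
    struct Recorder {
      explicit Recorder(Registry& reg);
      void Record(const char* name) { events.push_back(name); }
      std::vector<const char*> events;
    };

    // Global registry: locked only when a thread registers or when gathering.
    struct Registry {
      void Register(std::thread::id tid, Recorder* r) {
        std::lock_guard<std::mutex> guard(mu);
        recorders[tid] = r;
      }
      std::mutex mu;
      std::unordered_map<std::thread::id, Recorder*> recorders;
    };

    Recorder::Recorder(Registry& reg) { reg.Register(std::this_thread::get_id(), this); }

    Registry g_registry;

    void RecordHostEvent(const char* name) {
      static thread_local Recorder tls_recorder(g_registry);  // built on first use per thread
      tls_recorder.Record(name);
    }

The real implementation additionally places events into 16 MB blocks and copies string attributes into a 4 MB string arena, so recording an event avoids per-event heap allocations as well.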
Event *e = PushEvent(name, role, attr); SetCurAnnotation(e); - name_ = e->name(); + // name_ = e->name(); } RecordEvent::~RecordEvent() { @@ -88,15 +419,36 @@ RecordEvent::~RecordEvent() { } #endif #endif + uint64_t end_ns = PosixInNsec(); + if (LIKELY(g_enable_host_event_recorder_hook)) { + if (LIKELY(shallow_copy_name_ != nullptr)) { + HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, + start_ns_, end_ns, role_); + } else if (name_ != nullptr) { + if (attr_ == nullptr) { + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_); + } else { + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_, *attr_); + } + } + delete name_; + delete attr_; + return; + } + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { - tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), - BlockDepth(), g_thread_id); + tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), + g_thread_id); } ClearCurAnnotation(); - PopEvent(name_, role_); + PopEvent(*name_, role_); + delete name_; + delete attr_; } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -148,11 +500,11 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } -RecordRPCEvent::RecordRPCEvent(const std::string &name) { +/*RecordRPCEvent::RecordRPCEvent(const std::string &name) { if (FLAGS_enable_rpc_profiler) { event_.reset(new platform::RecordEvent(name)); } -} +}*/ RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { @@ -362,5 +714,20 @@ void NvprofEnableRecordEvent() { void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } +void EnableHostEventRecorder() { g_enable_host_event_recorder_hook = true; } + +std::string PrintHostEvents() { + std::ostringstream oss; + auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { + oss << thr_evt_sec.thread_id << std::endl; + for (const auto &evt : thr_evt_sec.events) { + oss << "{ " << evt.name << " | " << evt.start_ns << " | " << evt.end_ns + << " }" << std::endl; + } + } + return oss.str(); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 02930627d41e3..5d1caffd45326 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -29,7 +29,7 @@ __global__ void DummyKernel(int *a) { a[0] = 0; } static void ForEachDevice(std::function func) { auto original_device = platform::GetCurrentDeviceId(); - int count = platform::GetCUDADeviceCount(); + int count = platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { platform::SetDeviceId(i); func(i); @@ -43,13 +43,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); hipStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&ptr, sizeof(int))); hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); + 
PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } #else @@ -57,13 +57,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); cudaStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&ptr, sizeof(int))); DummyKernel<<<1, 1, 0, stream>>>(ptr); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(ptr)); }); } #endif diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index fbae6165e313a..317991160b798 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.pb.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif namespace paddle { @@ -128,31 +128,38 @@ struct MemEvenRecorder { }; struct RecordEvent { - RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary, - const std::string attr = "none"); + explicit RecordEvent(const std::string& name, + const EventRole role = EventRole::kOrdinary); + + explicit RecordEvent(const char* name, + const EventRole role = EventRole::kOrdinary); + + RecordEvent(const std::string& name, const EventRole role, + const std::string& attr); ~RecordEvent(); bool is_enabled_{false}; bool is_pushed_{false}; - uint64_t start_ns_; // Event name - std::string name_; + const std::string* name_{nullptr}; + const char* shallow_copy_name_{nullptr}; + uint64_t start_ns_; // Need to distinguish name by op type, block_id, program_id and perhaps // different kernel invocations within an op. 
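Given the constructor split declared here, hot call sites should prefer the const char* overload: a string literal outlives the event, so only the pointer is kept (shallow_copy_name_), while the std::string overload deep-copies the name, as the comment in profiler.cc warns. A usage sketch, assuming it is compiled inside the Paddle tree with the header above; the surrounding function and block_id are made up:

    #include <string>
    #include "paddle/fluid/platform/profiler.h"

    void TimedSection(int block_id) {
      {  // hot path: literal name, no copy
        paddle::platform::RecordEvent guard("elementwise_add_compute");
        // ... timed work ...
      }
      {  // dynamic name: deep-copied by the std::string overload
        std::string name = "block_" + std::to_string(block_id) + "/fetch";
        paddle::platform::RecordEvent guard(name, paddle::platform::EventRole::kOrdinary);
      }
    }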
- std::string full_name_; + // std::string full_name_; EventRole role_{EventRole::kOrdinary}; + const std::string* attr_{nullptr}; }; -class RecordRPCEvent { +/*class RecordRPCEvent { public: explicit RecordRPCEvent(const std::string& name); ~RecordRPCEvent() {} private: std::unique_ptr event_; -}; +};*/ struct RecordBlock { explicit RecordBlock(int block_id); @@ -242,5 +249,10 @@ int64_t ListenerId(); void NvprofEnableRecordEvent(); void NvprofDisableRecordEvent(); +void EnableHostEventRecorder(); + +// Defined for UT +std::string PrintHostEvents(); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index a8438263cb97b..4277f7d4dc63e 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -47,6 +47,8 @@ static TracerOption g_tracer_option = TracerOption::kDefault; static ProfilerState g_state = ProfilerState::kDisabled; // To hook RecordEvent's events, use it to nvtx timeline static bool g_enable_nvprof_hook = false; +// To hook RecordEvent, use HostEventRecorder +static bool g_enable_host_event_recorder_hook = false; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -119,17 +121,17 @@ std::vector> GetMemEvents() { void SynchronizeAllDevice() { #ifdef PADDLE_WITH_CUDA - int count = GetCUDADeviceCount(); + int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } #endif #ifdef PADDLE_WITH_HIP - int count = GetCUDADeviceCount(); + int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } #endif } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 212d99f6a78ed..dafb61fe0aaf4 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -30,18 +30,18 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); if (priority == Priority::kHigh) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream_, static_cast(flag), -1)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream_, static_cast(flag), -1)); #endif } else if (priority == Priority::kNormal) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream_, static_cast(flag), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream_, static_cast(flag), 0)); #endif } @@ -58,9 +58,9 @@ void CUDAStream::Destroy() { WaitCallback(); if (stream_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif } stream_ = nullptr; @@ -89,7 +89,7 @@ void CUDAStream::Wait() const { #endif #endif // PADDLE_WITH_HIP - 
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); + PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } CUDAStream* get_current_stream(int deviceId) { diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 472d6bbab0c6c..36f31c46673b2 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream_callback_manager.h" @@ -64,32 +64,32 @@ class CUDAStream final { #ifdef PADDLE_WITH_HIP void RecordEvent(hipEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else void RecordEvent(cudaEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } #endif #ifdef PADDLE_WITH_HIP void RecordEvent(hipEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else void RecordEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } #endif #ifdef PADDLE_WITH_HIP void WaitEvent(hipEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); } #else void WaitEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); } #endif @@ -122,17 +122,11 @@ class CUDAStream final { } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); return false; } - void Synchronize() const { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); -#endif - } + void Synchronize() const { platform::GpuStreamSync(stream_); } const Place& GetPlace() const { return place_; } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 3f0c5ace900d1..28aa022fe2f13 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -13,9 +13,9 @@ // limitations under the License. 
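The Synchronize() change at the end of the cuda_stream.h hunk folds the HIP/CUDA #ifdef pair into a single platform::GpuStreamSync call. Judging from the GpuStreamSync deleted from gpu_info.cc earlier in this patch, the relocated helper is presumably the same two-way dispatch with the renamed macro; a sketch only, not copied from the new header:

    // Presumed shape of platform::GpuStreamSync after the move to device/gpu/.
    void GpuStreamSync(gpuStream_t stream) {
    #ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
    #else
      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
    #endif
    }

The same helper lets StreamCallbackManager::Wait() in the next hunk collapse its two backend-specific branches into one guarded call.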
#include "paddle/fluid/platform/stream_callback_manager.h" -#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#endif +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { @@ -59,15 +59,15 @@ void StreamCallbackManager::AddCallback( }); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif #endif @@ -81,11 +81,8 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#endif -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) + platform::GpuStreamSync(stream_); #endif #ifdef PADDLE_WITH_ASCEND_CL NPUStreamSync(stream_); diff --git a/paddle/fluid/platform/test_limit_gpu_memory.cu b/paddle/fluid/platform/test_limit_gpu_memory.cu index 81b766182337f..684cb78073551 100644 --- a/paddle/fluid/platform/test_limit_gpu_memory.cu +++ b/paddle/fluid/platform/test_limit_gpu_memory.cu @@ -15,7 +15,7 @@ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" DECLARE_uint64(gpu_memory_limit_mb); @@ -30,32 +30,24 @@ TEST(test_record_malloc, test_limit_gpu_memory) { size_t limit = FLAGS_gpu_memory_limit_mb << 20; { - ASSERT_TRUE(IsCudaMallocRecorded(DEVICE_ID)); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + ASSERT_TRUE(IsGpuMallocRecorded(DEVICE_ID)); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } size_t avail, total; { size_t actual_avail, actual_total; - RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total, - DEVICE_ID); + RecordedGpuMemGetInfo(&avail, &total, &actual_avail, &actual_total, + DEVICE_ID); ASSERT_EQ(total, limit); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); } { CUDADeviceGuard guard(DEVICE_ID); GpuMemoryUsage(&avail, &total); ASSERT_EQ(total, limit); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); } gpuError_t err = gpuSuccess; @@ -63,54 +55,41 @@ TEST(test_record_malloc, test_limit_gpu_memory) { void *p1 = nullptr; size_t size1 = limit / 4 * 3; { - err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID); + err = platform::RecordedGpuMalloc(&p1, size1, DEVICE_ID); ASSERT_EQ(err, gpuSuccess); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_NE(p1, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size1); } void *p2 = nullptr; size_t size2 = limit / 2; { - err = platform::RecordedCudaMalloc(&p2, 
size2, DEVICE_ID); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(err, hipErrorOutOfMemory); - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(err, cudaErrorMemoryAllocation); - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + err = platform::RecordedGpuMalloc(&p2, size2, DEVICE_ID); + ASSERT_EQ(err, gpuErrorOutOfMemory); + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_EQ(p2, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size1); } { - platform::RecordedCudaFree(p1, size1, DEVICE_ID); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + platform::RecordedGpuFree(p1, size1, DEVICE_ID); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } { - err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID); + err = platform::RecordedGpuMalloc(&p2, size2, DEVICE_ID); ASSERT_EQ(err, gpuSuccess); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), hipSuccess); -#else - ASSERT_EQ(cudaGetLastError(), cudaSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_NE(p2, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size2); } { - platform::RecordedCudaFree(p2, size2, DEVICE_ID); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + platform::RecordedGpuFree(p2, size2, DEVICE_ID); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 588caed5a452e..e6d0a096b2d80 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -16,6 +16,9 @@ endif() if (WITH_GPU) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler) endif() +if (WITH_IPU) + set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) +endif() if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) @@ -25,6 +28,13 @@ endif() if (WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) +endif() + +if (WITH_ASCEND_CL) + set(PYBIND_DEPS ${PYBIND_DEPS} reducer) + set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() if(NOT WIN32) @@ -32,9 +42,7 @@ if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) - endif() - if (WITH_ASCEND_CL) - set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() endif(NOT WIN32) @@ -126,17 +134,25 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) + add_executable(eager_op_function_generator eager_op_function_generator.cc) + target_link_libraries(eager_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) + target_link_libraries(eager_op_function_generator ${os_dependency_modules}) if(WITH_ROCM) target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB}) + target_link_libraries(eager_op_function_generator ${ROCM_HIPRTC_LIB}) endif() set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) set(tmp_impl_file ${impl_file}.tmp) + set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h) + set(tmp_eager_impl_file ${eager_impl_file}.tmp) set(OP_IMPL_DEPS 
op_function_generator) + set(EAGER_OP_IMPL_DEPS eager_op_function_generator) + if(WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}") @@ -160,22 +176,41 @@ if(WITH_PYTHON) ")\n" "exit /b 0") + file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat "" + "set build_times=1\n" + ":retry\n" + "ECHO eager_op_function_generator run %build_times% time\n" + "taskkill /f /im eager_op_function_generator.exe 2>NUL\n" + "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n" + "if %ERRORLEVEL% NEQ 0 (\n" + " set /a build_times=%build_times%+1\n" + " if %build_times% GEQ 10 (\n" + " exit /b 1\n" + " ) else (\n" + " goto :retry\n" + " )\n" + ")\n" + "exit /b 0") + if(${CBLAS_PROVIDER} STREQUAL MKLML) ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path} DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path} DEPENDS extern_openblas) list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/openblas.dll) endif() if(WITH_MKLDNN) ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path} DEPENDS mkldnn) list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) endif() add_custom_command(OUTPUT ${impl_file} @@ -183,6 +218,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS}) + if(NOT ON_INFER) + add_custom_command(OUTPUT ${eager_impl_file} + COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} + COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" + DEPENDS ${EAGER_OP_IMPL_DEPS}) + endif() else(WIN32) # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to @@ -193,12 +235,14 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) + list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) endif() if(WITH_MKLDNN) ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0 COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} DEPENDS mkldnn) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) + list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) endif() add_custom_command(OUTPUT ${impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." @@ -208,10 +252,35 @@ if(WITH_PYTHON) COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) + if(NOT ON_INFER) + add_custom_command(OUTPUT ${eager_impl_file} + COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
+ "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" + "${tmp_eager_impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} + COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" + DEPENDS ${EAGER_OP_IMPL_DEPS} + VERBATIM) + endif() endif(WIN32) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) + if(NOT ON_INFER) + add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) + endif() list(APPEND PYBIND_DEPS interpretercore standalone_executor) + cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) + list(APPEND PYBIND_DEPS op_function_common) + + if(NOT ON_INFER) + cc_library(paddle_eager + SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc + DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node math_cpu linalg_cpu creation_cpu utils_cpu manipulation_cpu accumulation_node global_utils utils python) + add_dependencies(paddle_eager eager_codegen) + add_dependencies(paddle_eager eager_op_function_generator_cmd) + list(APPEND PYBIND_DEPS paddle_eager) + endif() + cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 115be1b8ba8b4..6fc9b2a494f61 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" @@ -32,8 +33,8 @@ void BindFleetExecutor(py::module* m) { py::class_(*m, "FleetExecutor") .def(py::init()) .def("init", &FleetExecutor::Init) - .def("run", &FleetExecutor::Run) - .def("release", &FleetExecutor::Release); + .def("run", &FleetExecutor::Run, + py::call_guard()); py::class_(*m, "TaskNode") .def(py::init()) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 311fb872ac103..21571e17a2b48 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -61,9 +61,9 @@ void BindCudaStream(py::module *m_ptr) { int curr_device_id = paddle::platform::GetCurrentDeviceId(); paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif paddle::platform::SetDeviceId(curr_device_id); #else @@ -264,7 +264,7 @@ void BindCudaStream(py::module *m_ptr) { auto stream_flag = paddle::platform::stream::StreamFlag::kStreamNonBlocking; - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); if (device < 0) { device = platform::GetCurrentDeviceId(); } diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc new file mode 100644 index 0000000000000..0714080382205 --- /dev/null +++ b/paddle/fluid/pybind/eager.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "paddle/fluid/pybind/eager_op_function_impl.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +PyTypeObject* p_eager_tensor_type; + +PyObject* eagertensor_new(PyTypeObject* type, PyObject* args, + PyObject* kwargs) { + PyObject* obj = type->tp_alloc(type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + } + return obj; +} + +static void eagertensor_dealloc(EagerTensorObject* self) { + self->eagertensor.~EagerTensor(); + Py_TYPE(self)->tp_free(reinterpret_cast(self)); +} + +extern struct PyGetSetDef variable_properties[]; + +extern PyMethodDef variable_methods[]; + +PyTypeObject eager_tensor_type = { + PyVarObject_HEAD_INIT(NULL, 0) "core_avx.eager.EagerTensor", /* tp_name */ + sizeof(EagerTensorObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)eagertensor_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | + Py_TPFLAGS_HEAPTYPE, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + variable_methods, /* tp_methods */ + 0, /* tp_members */ + variable_properties, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + eagertensor_new, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0 /* tp_finalize */ +}; + +void BindEager(pybind11::module* module) { + auto m = module->def_submodule("eager"); + + p_eager_tensor_type = &eager_tensor_type; + if (PyType_Ready(&eager_tensor_type) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindEager(PyType_Ready).")); + return; + } + + Py_INCREF(&eager_tensor_type); + if 
(PyModule_AddObject(m.ptr(), "EagerTensor", + reinterpret_cast(&eager_tensor_type)) < 0) { + Py_DECREF(&eager_tensor_type); + Py_DECREF(m.ptr()); + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindEager(PyModule_AddObject).")); + return; + } + + BindFunctions(m.ptr()); + BindEagerOpFunctions(&m); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/pten/api/include/manipulation.h b/paddle/fluid/pybind/eager.h similarity index 66% rename from paddle/pten/api/include/manipulation.h rename to paddle/fluid/pybind/eager.h index 579fa5cdf945a..c1a869d9b89fa 100644 --- a/paddle/pten/api/include/manipulation.h +++ b/paddle/fluid/pybind/eager.h @@ -1,28 +1,24 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "paddle/pten/api/include/tensor.h" +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis); +namespace pybind { -PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype); +void BindEager(pybind11::module* m); +void BindFunctions(PyObject* module); -PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector& shape); -} // namespace experimental +} // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc new file mode 100644 index 0000000000000..8c0f9ddf19f12 --- /dev/null +++ b/paddle/fluid/pybind/eager_functions.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +extern PyTypeObject* p_eager_tensor_type; + +size_t PyArray_Size_(PyObject* numpy_data) { + size_t res = 1; + auto dims = pybind11::detail::array_proxy(numpy_data)->dimensions; + auto nd = pybind11::detail::array_proxy(numpy_data)->nd; + while (nd--) { + res *= (*dims++); + } + return res; +} + +class EagerNumpyAllocation : public paddle::memory::allocation::Allocation { + public: + explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) + : Allocation( + static_cast(pybind11::detail::array_proxy(numpy_data)->data), + pten::DataTypeSize(dtype) * PyArray_Size_(numpy_data), + paddle::platform::CPUPlace()), + arr_(numpy_data) { + PADDLE_ENFORCE_NOT_NULL(arr_, platform::errors::InvalidArgument( + "The underlying PyObject pointer of " + "numpy array cannot be nullptr")); + PADDLE_ENFORCE_NE( + arr_, Py_None, + platform::errors::PreconditionNotMet( + "The underlying PyObject pointer of numpy array cannot be None")); + Py_INCREF(arr_); + } + ~EagerNumpyAllocation() override { + py::gil_scoped_acquire gil; + Py_DECREF(arr_); + } + + private: + PyObject* arr_; +}; + +static PyObject* eager_api_set_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + egr::Controller::Instance().SetExpectedPlace(place); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_scale(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Sync Tensor and Variable here when we support + egr::EagerTensor ret = + egr::scale(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eagertensor, + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1), + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4)); + return ToPyObject(ret); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_numpy_to_tensor(PyObject* numpy_data, + pten::DataType dtype, + const paddle::platform::Place& place, + bool stop_gradient) { + std::vector vec_dims; + auto numpy_shape = pybind11::detail::array_proxy(numpy_data)->dimensions; + int rank = pybind11::detail::array_proxy(numpy_data)->nd; + for (int i = 0; i < rank; i++) { + vec_dims.push_back(static_cast(numpy_shape[i])); + } + paddle::framework::DDim dims = paddle::framework::make_ddim(vec_dims); + + // TODO(jiabin): Support GPU later + auto meta = pten::DenseTensorMeta(dtype, dims); + 
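+  // The numpy buffer is wrapped rather than copied: EagerNumpyAllocation
+  // Py_INCREFs the array in its constructor so it stays alive as long as the
+  // DenseTensor's storage does, and Py_DECREFs it (re-acquiring the GIL) when
+  // the allocation is destroyed.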
auto holder = std::make_shared(numpy_data, dtype); + auto shared_storage = + pten::make_intrusive(holder, 0); + std::shared_ptr densetensor( + new pten::DenseTensor(std::move(shared_storage), std::move(meta))); + + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor.set_impl(densetensor); + v->eagertensor.set_name(egr::Controller::Instance().GenerateUniqueName()); + auto meta = egr::EagerUtils::autograd_meta(&(v->eagertensor)); + meta->SetStopGradient(stop_gradient); + + // Created tensor will be leaf tensor + // So we append AccumulationNode to it. + auto accumulation_node = std::make_shared(); + meta->SetGradNode(accumulation_node); + + // TODO(jiabin): Shall we increase ref cnt here to make python ref cnt num + // correctly? + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + + return obj; +} + +static PyObject* eager_api_to_tensor(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Support Kwargs here + PyObject* data = PyTuple_GET_ITEM(args, 0); + auto str_dtype = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + pten::DataType dtype = pten::String2DataType(str_dtype); + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); + bool stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + // TODO(jiabin): Support this when python given name + // auto str_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 4), 4); + + if (pybind11::detail::npy_api::get().PyArray_Check_(data)) { + return eager_api_numpy_to_tensor(data, dtype, place, stop_gradient); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Eater to_tensor only support numpy to tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_retain_grad_for_tensor(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::egr_utils_api::RetainGradForTensor( + CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0)); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensors = + CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 1), 1); + RunBackward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_functions[] = { + {"to_tensor", (PyCFunction)(void (*)(void))eager_api_to_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"scale", (PyCFunction)(void (*)(void))eager_api_scale, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_expected_place", + (PyCFunction)(void (*)(void))eager_api_set_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"retain_grad_for_tensor", + (PyCFunction)(void (*)(void))eager_api_retain_grad_for_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +void BindFunctions(PyObject* module) { + if (PyModule_AddFunctions(module, variable_functions) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindFunctions(PyModule_AddFunctions).")); + return; + } +} + +} // namespace pybind +} // 
namespace paddle diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc new file mode 100644 index 0000000000000..75fd8c7fabe63 --- /dev/null +++ b/paddle/fluid/pybind/eager_method.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +namespace paddle { +namespace pybind { + +extern PyTypeObject* pEagerTensorType; + +static PyObject* eager_tensor_method_numpy(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + if (!self->eagertensor.initialized()) { + Py_INCREF(Py_None); + return Py_None; + } + auto tensor_dims = self->eagertensor.shape(); + auto numpy_dtype = TensorDtype2NumpyDtype(self->eagertensor.type()); + auto sizeof_dtype = pten::DataTypeSize(self->eagertensor.type()); + Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; + Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + py_dims[i] = static_cast(tensor_dims[i]); + py_strides[i] = sizeof_dtype * numel; + numel *= py_dims[i]; + } + auto& api = pybind11::detail::npy_api::get(); + PyObject* array = api.PyArray_NewFromDescr_( + api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype), + tensor_dims.size(), py_dims, py_strides, nullptr, + pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ | + pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, + nullptr); + + if (self->eagertensor.is_cpu()) { + auto dense_tensor = + std::dynamic_pointer_cast(self->eagertensor.impl()); + platform::CPUPlace place; + // deep copy + paddle::memory::Copy(place, reinterpret_cast( + pybind11::detail::array_proxy(array)->data), + place, dense_tensor->data(), sizeof_dtype * numel); +#if defined(PADDLE_WITH_CUDA) + } else if (self->eagertensor.is_cuda()) { + auto dense_tensor = + std::dynamic_pointer_cast(self->eagertensor.impl()); + + paddle::platform::GpuMemcpySync( + pybind11::detail::array_proxy(array)->data, dense_tensor->data(), + pten::DataTypeSize(dense_tensor->dtype()) * dense_tensor->numel(), + cudaMemcpyDeviceToHost); +#endif + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Tensor.numpy() only support cpu tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + + return array; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* 
eager_tensor_method_is_initialized(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + if (self->eagertensor.Var().IsInitialized()) { + self->eagertensor.SyncToTensor(); + } + return ToPyObject(self->eagertensor.initialized()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_methods[] = { + {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_initialized", + (PyCFunction)(void (*)(void))eager_tensor_method_is_initialized, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc new file mode 100644 index 0000000000000..46d0bdcb46de7 --- /dev/null +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#endif + +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/pybind/pybind.h" +#include "paddle/fluid/string/string_helper.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#endif +#include "paddle/fluid/pybind/op_function_generator.h" + +std::set gen_list = {"elementwise_add", "reduce_sum", "matmul_v2", + "sigmoid"}; + +// clang-format off +const char* OUT_INITIALIZER_TEMPLATE = + R"({"%s", {std::shared_ptr(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; +const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; + +const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; +const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})"; + +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + if (%s != nullptr) { + ins["%s"] = {%s}; + } +)"; + +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( + if (%s.size() != 0) { + ins["%s"] = %s; + } +)"; + +const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + outs["%s"] = {%s}; +)"; + +const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( + outs["%s"] = %s; +)"; +// if inputs is list, no need {} +const char* ARG_OUT_NUM = R"(%sNum)"; +const char* ARG_OUT_NUM_TYPE = R"(size_t )"; + +const char* IN_VAR_TYPE = R"(py::handle)"; +const char* IN_VAR_LIST_TYPE = R"(py::handle)"; + +const char* OUT_VAR_TYPE = R"(std::shared_ptr)"; +const char* OUT_VAR_LIST_TYPE = R"(std::vector>)"; + +const char* CAST_VAR_TEMPLATE = R"( + auto %s = GetEagerTensorFromArgs("%s", "%s", args, %d, %s);)"; + +const char* CAST_VAR_LIST_TEMPLATE = R"( + auto %s = GetEagerTensorListFromArgs("%s", "%s", args, %d, %s);)"; + +const char* CAST_SIZE_T_TEMPLATE = R"( + auto %s = GetUnsignedLongFromArgs("%s", "%s", 
args, %d, %s);)"; + +const char* ARG_TEMPLATE = R"(const %s& %s)"; + +const char* RETURN_TUPLE_TYPE = R"(std::tuple<%s>)"; +const char* RETURN_TUPLE_TEMPLATE = R"(std::make_tuple(%s))"; +const char* RETURN_LIST_TEMPLATE = R"(outs["%s"])"; +const char* RETURN_TEMPLATE = R"(outs["%s"][0])"; + +const char* FUNCTION_ARGS = R"(%s, const py::args& args)"; +const char* FUNCTION_ARGS_NO_INPUT = R"(const py::args& args)"; + +const char* HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT = R"( + if (ins.count("%s") && outs.count("%s")) { + HandleViewBetweenInputAndOutput(ins["%s"][0], outs["%s"][0]); + })"; + +const char* OP_FUNCTION_TEMPLATE = +R"( +static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) +{ + PyThreadState *tstate = nullptr; + try + { + %s + framework::AttributeMap attrs; + ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs); + tstate = PyEval_SaveThread(); + %s + PyEval_RestoreThread(tstate); + tstate = nullptr; + %s + } + catch(...) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +})"; + +const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, METH_VARARGS | METH_KEYWORDS, "C++ interface function for %s in dygraph."},)"; + +// clang-format on +static inline bool FindInsMap(const std::string& op_type, + const std::string& in_name) { + return op_ins_map[op_type].count(in_name); +} + +static inline bool FindOutsMap(const std::string& op_type, + const std::string& out_name) { + return op_outs_map[op_type].count(out_name); +} + +static inline bool FindPassingOutsMap(const std::string& op_type, + const std::string& out_name) { + return op_passing_outs_map[op_type].count(out_name); +} + +static inline bool FindViewOpMap(const std::string& op_type) { + return view_op_map.count(op_type); +} + +static inline std::string TempName(const std::string& name) { + return name + '_'; +} + +std::string GenerateOpFunctionsBody( + const paddle::framework::proto::OpProto* op_proto, std::string func_name, + bool use_inplace_strategy = false, + std::map inplace_map = {}) { + auto& op_type = op_proto->type(); + std::string input_args = ""; + std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string ins_initializer_with_null = ""; + std::string py_arg = ""; + int arg_idx = 0; + int input_args_num = 0; + std::string ins_cast_str = ""; + std::string view_strategy_str = ""; + for (auto& input : op_proto->inputs()) { + auto& in_name = input.name(); + // skip those dispensable inputs, like ResidualData in conv2d + if (input.dispensable() && !FindInsMap(op_type, in_name)) { + continue; + } + const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE; + auto input_arg = + paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name)); + input_args += input_arg; + input_args += ","; + input_args_num++; + const auto in_cast_type = + input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + auto dispensable = input.dispensable() ? "true" : "false"; + ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, + in_name, arg_idx++, dispensable); + + if (input.dispensable()) { + const auto in_template = input.duplicable() + ? 
INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST + : INPUT_INITIALIZER_TEMPLATE_WITH_NULL; + ins_initializer_with_null += + paddle::string::Sprintf(in_template, in_name, in_name, in_name); + } else { + call_api_str += in_name + ", "; + } + } + + if (!input_args.empty() && input_args.back() == ',') { + input_args.pop_back(); + } + + // Generate outs initializer + std::string outs_initializer = "{"; + std::string outs_initializer_with_null = ""; + std::string return_str = ""; + + int outs_num = 0; + for (auto& output : op_proto->outputs()) { + auto& out_name = output.name(); + + // skip those dispensable oututs + if (output.dispensable() && !FindOutsMap(op_type, out_name)) { + continue; + } + const auto out_type = + output.duplicable() ? OUT_VAR_LIST_TYPE : OUT_VAR_TYPE; + + if (FindPassingOutsMap(op_type, out_name)) { + if (input_args != "") { + input_args += ","; + } + input_args += out_type; + input_args += out_name; + input_args_num++; + + if (output.dispensable()) { + const auto out_template = + output.duplicable() ? OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST + : OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL; + outs_initializer_with_null += + paddle::string::Sprintf(out_template, out_name, out_name); + } else { + const auto out_template = output.duplicable() + ? INPUT_LIST_INITIALIZER_TEMPLATE + : INPUT_INITIALIZER_TEMPLATE; + outs_initializer += + paddle::string::Sprintf(out_template, out_name, out_name); + outs_initializer += ","; + } + + const auto in_cast_type = + output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + auto dispensable = output.dispensable() ? "true" : "false"; + ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, + out_name, arg_idx++, dispensable); + } else { + // There are few Operators that have duplicable output, like `Out` in + // split op. We need to specify the number of variables for the + // duplicable output, as the argument OutNum; + if (output.duplicable()) { + if (input_args != "") { + input_args += ","; + } + auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name); + input_args += ARG_OUT_NUM_TYPE; + input_args += out_num_str; + input_args_num++; + outs_initializer += paddle::string::Sprintf( + OUT_DUPLICABLE_INITIALIZER_TEMPLATE, out_name, out_num_str); + + auto dispensable = output.dispensable() ? 
"true" : "false"; + ins_cast_str += + paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, op_type, + out_num_str, arg_idx++, dispensable); + call_api_str += out_num_str + ", "; + } else { + outs_initializer += + paddle::string::Sprintf(OUT_INITIALIZER_TEMPLATE, out_name); + } + outs_initializer += ","; + } + + // return_str += paddle::string::Sprintf(return_template, out_name); + // return_str += ","; + outs_num += 1; + } + call_api_str += "attrs);"; + if (outs_initializer.back() == ',') { + outs_initializer.pop_back(); + // return_str.pop_back(); + } + outs_initializer += "}"; + if (FindViewOpMap(op_type)) { + std::string viwe_input_name = view_op_map[op_type].first; + std::string viwe_output_name = view_op_map[op_type].second; + view_strategy_str += paddle::string::Sprintf( + HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, + viwe_input_name, viwe_output_name); + } + if (outs_num == 0) { + return_str = "Py_INCREF(Py_None);\n return Py_None;"; + } else { + return_str = "return ToPyObject(out);"; + } + std::string function_args = ""; + if (input_args == "") { + function_args = FUNCTION_ARGS_NO_INPUT; + } else { + function_args = paddle::string::Sprintf(FUNCTION_ARGS, input_args); + } + + // generate op funtcion body + auto op_function_str = paddle::string::Sprintf( + OP_FUNCTION_TEMPLATE, func_name, ins_cast_str, op_type, input_args_num, + call_api_str, return_str); + + return op_function_str; +} + +static std::tuple, std::vector> +GenerateOpFunctions() { + auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); + + std::vector op_function_list, bind_function_list; + auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto& pair : op_info_map) { + auto& op_info = pair.second; + auto op_proto = op_info.proto_; + if (op_proto == nullptr) { + continue; + } + auto& op_type = op_proto->type(); + // Skip ooerator which is not inherit form OperatorWithKernel, like while, + // since only OperatorWithKernel can run in dygraph mode. 
+ // if the pten lib contains op kernel, we still generate ops method + if (!all_kernels.count(op_type) && + !pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + continue; + } + if (!gen_list.count(op_type)) { + continue; + } + std::string func_name = "eager_api_" + op_type; + std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + + // generate pybind item + auto bind_function_str = paddle::string::Sprintf( + PYBIND_ITEM_TEMPLATE, op_type, func_name, op_type); + + op_function_list.emplace_back(std::move(op_function_str)); + bind_function_list.emplace_back(std::move(bind_function_str)); + } + return std::make_tuple(op_function_list, bind_function_list); +} + +int main(int argc, char* argv[]) { + if (argc != 2) { + std::cerr << "argc must be 2" << std::endl; + return -1; + } + +#ifdef PADDLE_WITH_ASCEND_CL + auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); + ascend_ptr->InitGEForUT(); +#endif + + std::vector headers{ + "\"pybind11/detail/common.h\"", + "\"paddle/fluid/pybind/op_function_common.h\"", + "\"paddle/fluid/pybind/exception.h\"", ""}; + + std::ofstream out(argv[1], std::ios::out); + + out << "#pragma once\n\n"; + + for (auto& header : headers) { + out << "#include " + header + "\n"; + } + + out << "\n\n"; + + auto op_funcs = GenerateOpFunctions(); + + out << "namespace paddle {\n" + << "namespace pybind {\n\n"; + out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); + out << "\n\n"; + + out << "static PyMethodDef ExtestMethods[] = {\n" + << paddle::string::join_strings(std::get<1>(op_funcs), '\n') + << "\n {nullptr,nullptr,0,nullptr}" + << "};\n\n"; + + out << "inline void BindEagerOpFunctions(pybind11::module *module) {\n" + << " auto m = module->def_submodule(\"ops\");\n" + << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.eager.ops failed!\"));\n" + << " }\n\n" + << " InitOpsAttrTypeMap();" + << "}\n\n" + << "} // namespace pybind\n" + << "} // namespace paddle\n"; + + out.close(); + +#ifdef PADDLE_WITH_ASCEND_CL + ge::GEFinalize(); +#endif + + return 0; +} diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc new file mode 100644 index 0000000000000..7f20f32e81a5e --- /dev/null +++ b/paddle/fluid/pybind/eager_properties.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +#pragma GCC diagnostic ignored "-Wwrite-strings" + +namespace paddle { +namespace pybind { + +extern PyTypeObject* p_eager_tensor_type; + +PyObject* eager_tensor_properties_get_name(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + return ToPyObject(self->eagertensor.name()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + self->eagertensor.set_name(CastPyArg2AttrString(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_stop_gradient(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + return ToPyObject(meta->StopGradient()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eagertensor); + return ToPyObject(meta->Grad()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_stop_gradient(EagerTensorObject* self, + PyObject* value, void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_persistable(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + return ToPyObject(meta->Persistable()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_persistable(EagerTensorObject* self, + PyObject* value, void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + meta->SetPersistable(CastPyArg2AttrBoolean(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_shape(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto ddim = self->eagertensor.shape(); + std::vector value; + size_t rank = static_cast(ddim.size()); + value.resize(rank); + for (size_t i = 0; i < rank; i++) { + value[i] = ddim[i]; + } + + return ToPyObject(value); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_place(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + return ToPyObject(self->eagertensor.place()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_place_str(EagerTensorObject* self, + void* closure) { + EAGER_TRY + 
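+  // Note: every accessor in this file calls SyncToTensor() first, presumably so
+  // that data still held in the wrapped framework::Variable is synchronized
+  // into the eager tensor before the property is read or written.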
self->eagertensor.SyncToTensor(); + std::stringstream ostr; + ostr << self->eagertensor.place(); + return ToPyObject(ostr.str()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_dtype(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + return ToPyObject(pten::DataType2String(self->eagertensor.type())); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +struct PyGetSetDef variable_properties[] = { + {"grad", (getter)eager_tensor_properties_get_grad, nullptr, nullptr, + nullptr}, + {"name", (getter)eager_tensor_properties_get_name, + (setter)eager_tensor_properties_set_name, nullptr, nullptr}, + {"stop_gradient", (getter)eager_tensor_properties_get_stop_gradient, + (setter)eager_tensor_properties_set_stop_gradient, nullptr, nullptr}, + {"persistable", (getter)eager_tensor_properties_get_persistable, + (setter)eager_tensor_properties_set_persistable, nullptr, nullptr}, + {"shape", (getter)eager_tensor_properties_get_shape, nullptr, nullptr, + nullptr}, + // {"is_leaf", (getter)eager_tensor_properties_get_is_leaf, nullptr, + // nullptr, + // nullptr}, + {"place", (getter)eager_tensor_properties_get_place, nullptr, nullptr, + nullptr}, + {"_place_str", (getter)eager_tensor_properties_get_place_str, nullptr, + nullptr, nullptr}, + {"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr, + nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc new file mode 100644 index 0000000000000..eb53884186ffc --- /dev/null +++ b/paddle/fluid/pybind/eager_utils.cc @@ -0,0 +1,451 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +extern PyTypeObject* p_eager_tensor_type; + +extern PyTypeObject* g_place_pytype; +extern PyTypeObject* g_cudaplace_pytype; +extern PyTypeObject* g_cpuplace_pytype; +extern PyTypeObject* g_xpuplace_pytype; +extern PyTypeObject* g_npuplace_pytype; +extern PyTypeObject* g_cudapinnedplace_pytype; + +int TensorDtype2NumpyDtype(pten::DataType dtype) { + switch (dtype) { + case pten::DataType::BOOL: + return pybind11::detail::npy_api::NPY_BOOL_; + case pten::DataType::INT8: + return pybind11::detail::npy_api::NPY_INT8_; + case pten::DataType::UINT8: + return pybind11::detail::npy_api::NPY_UINT8_; + case pten::DataType::INT16: + return pybind11::detail::npy_api::NPY_INT16_; + case pten::DataType::INT32: + return pybind11::detail::npy_api::NPY_INT32_; + case pten::DataType::INT64: + return pybind11::detail::npy_api::NPY_INT64_; + case pten::DataType::FLOAT16: + return pybind11::detail::NPY_FLOAT16_; + case pten::DataType::FLOAT32: + return pybind11::detail::npy_api::NPY_FLOAT_; + case pten::DataType::FLOAT64: + return pybind11::detail::npy_api::NPY_DOUBLE_; + case pten::DataType::COMPLEX64: + return pybind11::detail::NPY_COMPLEX64; + case pten::DataType::COMPLEX128: + return pybind11::detail::NPY_COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unknow pten::DataType, the int value = %d.", + static_cast(dtype))); + return 0; + } +} + +bool PyObject_CheckLongOrConvertToLong(PyObject** obj) { + if ((PyLong_Check(*obj) && !PyBool_Check(*obj))) { + return true; + } + + if (std::string((reinterpret_cast((*obj)->ob_type))->tp_name) + .find("numpy") != std::string::npos) { + auto to = PyNumber_Long(*obj); + if (to) { + *obj = to; + return true; + } + } + + return false; +} + +bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj) { + // sometimes users provide PyLong or numpy.int64 but attr is float + if (PyFloat_Check(*obj) || PyLong_Check(*obj)) { + return true; + } + if (std::string((reinterpret_cast((*obj)->ob_type))->tp_name) + .find("numpy") != std::string::npos) { + auto to = PyNumber_Float(*obj); + if (to) { + *obj = to; + return true; + } + } + return false; +} + +bool PyObject_CheckStr(PyObject* obj) { return PyUnicode_Check(obj); } + +bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { + if (obj == Py_None) { + return false; // To be compatible with QA integration testing. Some + // test case pass in None. 
+ } else if (obj == Py_True) { + return true; + } else if (obj == Py_False) { + return false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "bool, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckLongOrConvertToLong(&obj)) { + return static_cast(PyLong_AsLong(obj)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "int, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckLongOrConvertToLong(&obj)) { + return (int64_t)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "long, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckFloatOrConvertToFloat(&obj)) { + return static_cast(PyFloat_AsDouble(obj)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "float, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckStr(obj)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(obj, &size); + return std::string(data, static_cast(size)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "str, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + return ""; + } +} + +egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos) { + if (PyObject_IsInstance(obj, + reinterpret_cast(p_eager_tensor_type))) { + return reinterpret_cast(obj)->eagertensor; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "EagerTensor, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} + +std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, + ssize_t arg_pos) { + std::vector result; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_IsInstance( + item, reinterpret_cast(p_eager_tensor_type))) { + result.emplace_back( + reinterpret_cast(item)->eagertensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of bool, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, i)); + } + } + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_IsInstance( + item, reinterpret_cast(p_eager_tensor_type))) { + result.emplace_back( + reinterpret_cast(item)->eagertensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of bool, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, i)); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list or tuple, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } + return result; +} + +platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) { + 
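+  // Try each of the pybind11-registered place types in turn; the
+  // g_*place_pytype pointers declared extern above are filled in elsewhere in
+  // the pybind bindings, so the eager API accepts the same Place objects as
+  // the existing dygraph path.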
platform::Place place; + if (PyObject_IsInstance(obj, reinterpret_cast(g_place_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cudaplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cpuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_xpuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_npuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cudapinnedplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), " + "but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } + return place; +} + +PyObject* ToPyObject(bool value) { + if (value) { + Py_INCREF(Py_True); + return Py_True; + } else { + Py_INCREF(Py_False); + return Py_False; + } +} + +PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } + +PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } + +PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } + +PyObject* ToPyObject(double value) { return PyLong_FromDouble(value); } + +PyObject* ToPyObject(const char* value) { return PyUnicode_FromString(value); } + +PyObject* ToPyObject(const std::string& value) { + return PyUnicode_FromString(value.c_str()); +} + +PyObject* ToPyObject(const egr::EagerTensor& value) { + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor = value; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + return obj; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, (Py_ssize_t)i, ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = 
reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); + } + + return result; +} + +PyObject* ToPyObject(const platform::Place& value) { + auto obj = ::pybind11::cast(value); + obj.inc_ref(); + return obj.ptr(); +} + +egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check(obj)) { + obj = PyTuple_GET_ITEM(obj, 0); + } + + if (obj == nullptr || obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + egr::EagerTensor emptytensor; + return emptytensor; + } + + return reinterpret_cast(obj)->eagertensor; +} + +std::vector GetEagerTensorListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensor, but got " + "None", + op_type, arg_name, arg_idx)); + } + return {}; + } + + std::vector result; + + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back( + reinterpret_cast(PyList_GetItem(list, i)) + ->eagertensor); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back( + reinterpret_cast(PyTuple_GetItem(list, i)) + ->eagertensor); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + (reinterpret_cast(list->ob_type))->tp_name)); + } + + return result; +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h new file mode 100644 index 0000000000000..e72820c4dbe8c --- /dev/null +++ b/paddle/fluid/pybind/eager_utils.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +typedef struct { + PyObject_HEAD egr::EagerTensor eagertensor; +} EagerTensorObject; + +int TensorDtype2NumpyDtype(pten::DataType dtype); + +bool PyObject_CheckLongOrConvertToLong(PyObject** obj); +bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj); +bool PyObject_CheckStr(PyObject* obj); +bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos); +int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); +int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); +float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); +std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); +egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos); +std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, + ssize_t arg_pos); +platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); + +PyObject* ToPyObject(int value); +PyObject* ToPyObject(bool value); +PyObject* ToPyObject(int64_t value); +PyObject* ToPyObject(float value); +PyObject* ToPyObject(double value); +PyObject* ToPyObject(const char* value); +PyObject* ToPyObject(const std::string& value); +PyObject* ToPyObject(const egr::EagerTensor& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const platform::Place& value); + +template +struct TupleEagerTensorResult { + static void Run(const Tuple& out, PyObject* result) { + TupleEagerTensorResult::Run(out, result); + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } +}; + +template +struct TupleEagerTensorResult { + static void Run(const Tuple& out, PyObject* result) { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } +}; + +template +PyObject* ToPyObject(const std::tuple& out) { + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleEagerTensorResult::Run(out, result); + + return result; +} + +egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable = false); +std::vector GetEagerTensorListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 3d07985ff654e..362a3e44fab62 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -81,5 +81,48 @@ void BindException(pybind11::module* m) { }); } +void ThrowExceptionToPython(std::exception_ptr p) { + static PyObject* EOFExceptionException = + PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); + static PyObject* EnforceNotMetException = + PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EOFException& e) { + PyErr_SetString(EOFExceptionException, e.what()); + } catch (const platform::EnforceNotMet& e) { + switch (e.code()) { + case paddle::platform::error::INVALID_ARGUMENT: + PyErr_SetString(PyExc_ValueError, e.what()); + break; + case paddle::platform::error::NOT_FOUND: + case paddle::platform::error::ALREADY_EXISTS: + case 
paddle::platform::error::PRECONDITION_NOT_MET: + case paddle::platform::error::PERMISSION_DENIED: + case paddle::platform::error::EXECUTION_TIMEOUT: + case paddle::platform::error::UNAVAILABLE: + PyErr_SetString(PyExc_RuntimeError, e.what()); + break; + case paddle::platform::error::OUT_OF_RANGE: + PyErr_SetString(PyExc_IndexError, e.what()); + break; + case paddle::platform::error::RESOURCE_EXHAUSTED: + PyErr_SetString(PyExc_MemoryError, e.what()); + break; + case paddle::platform::error::UNIMPLEMENTED: + PyErr_SetString(PyExc_NotImplementedError, e.what()); + break; + case paddle::platform::error::FATAL: + PyErr_SetString(PyExc_SystemError, e.what()); + break; + case paddle::platform::error::EXTERNAL: + PyErr_SetString(PyExc_OSError, e.what()); + break; + default: + PyErr_SetString(EnforceNotMetException, e.what()); + break; + } + } +} } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h index 5e054267361f2..cf82f464a11f2 100644 --- a/paddle/fluid/pybind/exception.h +++ b/paddle/fluid/pybind/exception.h @@ -18,10 +18,26 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "pybind11/pybind11.h" +#define EAGER_TRY try { +#define EAGER_CATCH_AND_THROW_RETURN_NULL \ + } \ + catch (...) { \ + ThrowExceptionToPython(std::current_exception()); \ + return nullptr; \ + } + +#define EAGER_CATCH_AND_THROW_RETURN_ZERO \ + } \ + catch (...) { \ + ThrowExceptionToPython(std::current_exception()); \ + return 0; \ + } + namespace paddle { namespace pybind { void BindException(pybind11::module* m); +void ThrowExceptionToPython(std::exception_ptr p); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 5ff0e58d85801..dc97d98e8c47f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -37,6 +37,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/hccl_context.h" +#include "paddle/fluid/imperative/heter_ccl_context.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/nccl_context.h" @@ -59,18 +60,6 @@ PyTypeObject *g_varbase_pytype = nullptr; namespace py = ::pybind11; -class Layer : public imperative::Layer { - public: - using imperative::Layer::Layer; // Inherit constructors - - std::vector> Forward( - const std::vector> &inputs) - override { - PYBIND11_OVERLOAD(std::vector>, Layer, - Forward, inputs); // NOLINT - } -}; - template static T PyObjectCast(PyObject *obj) { try { @@ -1549,7 +1538,7 @@ void BindImperative(py::module *m_ptr) { self.MutableGradVarBase()->SetType(type); }) .def("_reset_grad_inplace_version", - [](imperative::VarBase &self) { + [](imperative::VarBase &self, bool set_to_zero) { /* *** This interfaceis a complete hack *** reset_grad_inplace_version removes all inplace related records to @@ -1561,15 +1550,20 @@ void BindImperative(py::module *m_ptr) { Make sure you fully understand what you're doing before make use of this interface, and prepare for the worst. 
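For reference, a minimal sketch of how the EAGER_TRY / EAGER_CATCH_AND_THROW_RETURN_NULL macros introduced in exception.h above are intended to be used: they bracket a raw Python C-API entry point so that any C++ exception is converted by ThrowExceptionToPython into the corresponding Python exception. The function name and body below are purely illustrative and are not part of this patch.

  static PyObject* eager_api_demo(PyObject* self, PyObject* args,
                                  PyObject* kwargs) {
    EAGER_TRY
    // A PADDLE_THROW(platform::errors::InvalidArgument(...)) raised in here
    // would surface on the Python side as a ValueError, per the mapping in
    // ThrowExceptionToPython.
    Py_RETURN_NONE;
    EAGER_CATCH_AND_THROW_RETURN_NULL
  }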
*/ + py::gil_scoped_release release; + if (self.HasGradVar()) { auto grad_var = self.GradVarBase(); auto var_wrapper = grad_var->SharedVar(); - if (var_wrapper) var_wrapper->ResetInplaceVersion(); + if (var_wrapper) { + var_wrapper->ResetInplaceVersion(set_to_zero); + } } }) .def("_grad_ivar", [](const imperative::VarBase &self) { auto &grad_var = self.GradVarBase(); + if (grad_var && grad_var->Var().IsInitialized()) { auto *tensor = grad_var->MutableVar()->IsType() @@ -1578,6 +1572,7 @@ void BindImperative(py::module *m_ptr) { : grad_var->MutableVar() ->GetMutable() ->mutable_value(); + if (tensor->IsInitialized()) { return grad_var; } @@ -1756,7 +1751,7 @@ void BindImperative(py::module *m_ptr) { "Cannot copy this Tensor to GPU in CPU version Paddle, " "Please recompile or reinstall Paddle with CUDA support.")); #else - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); int device_id = 0; if (handle == py::none()) { if (platform::is_gpu_place(self->Place())) { @@ -1975,10 +1970,6 @@ void BindImperative(py::module *m_ptr) { .def("_numel", [](std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); - PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self->Name())); return t->numel(); }) .def_property("name", &imperative::VarBase::Name, @@ -2051,18 +2042,6 @@ void BindImperative(py::module *m_ptr) { .def_property_readonly("type", &imperative::VarBase::Type) .def_property_readonly("dtype", &imperative::VarBase::DataType); - // NOTE(zhiqiu): set the metaclass of Layer. - // See details: https://github.com/pybind/pybind11/pull/679 - // https://github.com/pybind/pybind11/blob/028812ae7eee307dca5f8f69d467af7b92cc41c8/tests/test_methods_and_attributes.cpp#L284 - py::class_ layer( - m, "Layer", py::metaclass((PyObject *)&PyType_Type)); // NOLINT - layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector> &inputs) { - return self.Forward(inputs); - }); - py::class_(m, "ProgramDescTracer", "") .def("create_program_desc", &imperative::jit::ProgramDescTracer::CreateProgramDesc) @@ -2360,6 +2339,15 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) + py::class_>( + m, "HeterParallelContext") + .def(py::init()) + .def("init", [](imperative::HeterParallelContext &self) { self.Init(); }); +#endif + m.def("pylayer_apply", [](const platform::CPUPlace &place, const py::object &cls, const py::args args, const py::kwargs kwargs) { diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 997cb610fafca..7b9379df6be2c 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -29,34 +29,14 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/op_function_common.h" namespace py = pybind11; namespace paddle { namespace pybind { -class OpAttrTypeMap { - public: - static OpAttrTypeMap& Instance() { - static OpAttrTypeMap g_op_attr_type_map; - return g_op_attr_type_map; - } - - std::unordered_map< - std::string, - std::unordered_map>& - Map() { - return ops_attrtype_map_; - } - - private: - OpAttrTypeMap() = default; - 
std::unordered_map< - std::string, - std::unordered_map> - ops_attrtype_map_; -}; - static inline std::shared_ptr CastPyHandleToVarBase( const std::string& op_type, const std::string& arg_name, int arg_idx, const py::handle& handle, bool dispensable = false) { @@ -197,737 +177,7 @@ static inline void HandleViewBetweenInputAndOutput( } } -extern PyTypeObject* g_varbase_pytype; -extern PyTypeObject* g_vartype_pytype; -extern PyTypeObject* g_blockdesc_pytype; - -inline bool PyObject_CheckBool(PyObject** obj) { return PyBool_Check(*obj); } - -inline bool PyObject_CheckLongOrToLong(PyObject** obj) { - if ((PyLong_Check(*obj) && !PyBool_Check(*obj)) || - PyObject_IsInstance(*obj, (PyObject*)g_vartype_pytype) || // NOLINT - PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT - return true; - } - - if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT - .find("numpy") != std::string::npos) { - auto to = PyNumber_Long(*obj); - if (to) { - *obj = to; - return true; - } - } - - return false; -} - -inline bool PyObject_CheckFloatOrToFloat(PyObject** obj) { - // sometimes users provide PyLong or numpy.int64 but attr is float - if (PyFloat_Check(*obj) || PyLong_Check(*obj) || - PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT - return true; - } - if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT - .find("numpy") != std::string::npos) { - auto to = PyNumber_Float(*obj); - if (to) { - *obj = to; - return true; - } - } - return false; -} - -inline bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } - -static inline void CastPyArg2AttrBoolean( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (obj == Py_None) { - attrs[key] = false; // To be compatible with QA integration testing. Some - // test case pass in None. 
- } else if (obj == Py_True) { - attrs[key] = true; - } else if (obj == Py_False) { - attrs[key] = false; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "bool, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrInt( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckLongOrToLong(&obj)) { - attrs[key] = (int)PyLong_AsLong(obj); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "int, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrLong( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckLongOrToLong(&obj)) { - attrs[key] = (int64_t)PyLong_AsLong(obj); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "long, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrFloat( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckFloatOrToFloat(&obj)) { - attrs[key] = (float)PyFloat_AsDouble(obj); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "float, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrString( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckString(obj)) { - Py_ssize_t size; - const char* data; - data = PyUnicode_AsUTF8AndSize(obj, &size); - attrs[key] = std::string(data, static_cast(size)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "str, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrBooleans( - PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckBool(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of bool, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckBool(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of bool, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // 
NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrInts( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrLongs( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if 
(PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrFloats( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrFloat64s( - PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { 
- Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrStrings( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckString(item)) { - Py_ssize_t size; - const char* data; - data = PyUnicode_AsUTF8AndSize(item, &size); - value.emplace_back(std::string(data, (size_t)size)); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of str, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckString(item)) { - Py_ssize_t size; - const char* data; - data = PyUnicode_AsUTF8AndSize(item, &size); - value.emplace_back(std::string(data, static_cast(size))); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of str, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrBlock( - PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - ::pybind11::detail::instance* inst = - (::pybind11::detail::instance*)obj; // NOLINT - - if (!PyObject_IsInstance((PyObject*)inst, // NOLINT - (PyObject*)g_blockdesc_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "BlockDesc, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } - void** vh = 
inst->simple_layout ? inst->simple_value_holder - : &inst->nonsimple.values_and_holders[0]; - attrs[key] = reinterpret_cast(vh[0]); -} - -static inline void ConstructAttrMapFromPyArgs( - const std::string& op_type, PyObject* args, ssize_t attr_start, - ssize_t attr_end, paddle::framework::AttributeMap& attrs) { // NOLINT - PADDLE_ENFORCE_EQ( - (attr_end - attr_start) % 2, 0, - platform::errors::InvalidArgument( - "The number of arguments for attributes should be even.")); - - auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]); - - PyObject* obj = nullptr; - for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { - Py_ssize_t key_len; - const char* key_ptr; - obj = PyTuple_GET_ITEM(args, arg_pos); - if (PyObject_CheckString(obj)) { - key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be str, but got " - "%s", - op_type, arg_pos, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } - - std::string key(key_ptr, static_cast(key_len)); - auto iter = attr_type_map->find(key); - if (iter == attr_type_map->end()) { - continue; - } - - obj = PyTuple_GET_ITEM(args, arg_pos + 1); - - switch (iter->second) { - case paddle::framework::proto::AttrType::INT: - CastPyArg2AttrInt(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::FLOAT: - CastPyArg2AttrFloat(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::STRING: - CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::INTS: - CastPyArg2AttrInts(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::FLOATS: - CastPyArg2AttrFloats(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::STRINGS: - CastPyArg2AttrStrings(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::BOOLEAN: - CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::BOOLEANS: - CastPyArg2AttrBooleans(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::LONG: - CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::LONGS: - CastPyArg2AttrLongs(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::FLOAT64S: - CastPyArg2AttrFloat64s(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::BLOCK: - CastPyArg2AttrBlock(obj, attrs, key, op_type, arg_pos); - break; - default: - break; - } - } -} - -static inline std::shared_ptr GetVarBaseFromArgs( - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable = false) { - ::pybind11::detail::instance* inst = - (::pybind11::detail::instance*)PyTuple_GET_ITEM(args, arg_idx); - - if (PyTuple_Check((PyObject*)inst)) { // NOLINT - inst = (::pybind11::detail::instance*)PyTuple_GET_ITEM(inst, 0); - } - - if (inst == nullptr || (PyObject*)inst == Py_None) { // NOLINT - if (!dispensable) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be Tensor, but got None", - op_type, arg_name, arg_idx)); - } - return nullptr; - } - - if (!PyObject_IsInstance((PyObject*)inst, // NOLINT - (PyObject*)g_varbase_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must 
be Tensor, but got " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)((PyObject*)inst)->ob_type)->tp_name)); // NOLINT - } - - void** vh = inst->simple_layout ? inst->simple_value_holder - : &inst->nonsimple.values_and_holders[0]; - return reinterpret_cast&>(vh[1]); -} - -static inline std::vector> -GetVarBaseListFromArgs(const std::string& op_type, const std::string& arg_name, - PyObject* args, ssize_t arg_idx, - bool dispensable = false) { - PyObject* list = PyTuple_GET_ITEM(args, arg_idx); - - if (list == nullptr) { - if (!dispensable) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensor, but got " - "None", - op_type, arg_name, arg_idx)); // NOLINT - } - return {}; - } - - std::vector> result; - - if (PyList_Check(list)) { - Py_ssize_t len = PyList_Size(list); - if (len == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but got " - "empty list", - op_type, arg_name, arg_idx)); - } - ::pybind11::detail::instance* item = nullptr; - for (Py_ssize_t i = 0; i < len; i++) { - item = (::pybind11::detail::instance*)PyList_GetItem(list, i); - if (!PyObject_IsInstance((PyObject*)item, // NOLINT - (PyObject*)g_varbase_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but " - "got list of " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT - } - void** vh = item->simple_layout ? item->simple_value_holder - : &item->nonsimple.values_and_holders[0]; - result.emplace_back( - reinterpret_cast&>( - vh[1])); - } - } else if (PyTuple_Check(list)) { - Py_ssize_t len = PyTuple_Size(list); - if (len == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but got " - "empty list", - op_type, arg_name, arg_idx)); - } - ::pybind11::detail::instance* item = nullptr; - for (Py_ssize_t i = 0; i < len; i++) { - item = (::pybind11::detail::instance*)PyTuple_GetItem(list, i); // NOLINT - if (!PyObject_IsInstance((PyObject*)item, // NOLINT - (PyObject*)g_varbase_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but " - "got list of " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT - } - void** vh = item->simple_layout ? 
item->simple_value_holder - : &item->nonsimple.values_and_holders[0]; - result.emplace_back( - reinterpret_cast&>( - vh[1])); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but got " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)list->ob_type)->tp_name)); // NOLINT - } - - return result; -} - -static inline unsigned long GetUnsignedLongFromArgs( // NOLINT - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable = false) { - PyObject* item = PyTuple_GET_ITEM(args, arg_idx); - - if (item == nullptr) { - if (!dispensable) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be long, but got None", - op_type, arg_name, arg_idx)); - } - return 0; - } - - if (PyObject_CheckLongOrToLong(&item)) { - return PyLong_AsUnsignedLong(item); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be " - "long, but got %s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)item->ob_type)->tp_name)); // NOLINT - } -} - -static inline PyObject* MakeReturnPyObject( +PyObject* MakeReturnPyObject( const std::shared_ptr& out) { return ::pybind11::detail::type_caster_base::cast_holder( ::pybind11::detail::holder_helper< @@ -936,7 +186,7 @@ static inline PyObject* MakeReturnPyObject( .ptr(); } -static inline PyObject* MakeReturnPyObject( +PyObject* MakeReturnPyObject( const std::vector>& out) { PyObject* result = PyList_New((Py_ssize_t)out.size()); @@ -969,7 +219,7 @@ struct TupleVarBasesResult { }; template -static inline PyObject* MakeReturnPyObject(const std::tuple& out) { +PyObject* MakeReturnPyObject(const std::tuple& out) { auto len = sizeof...(Args); PyObject* result = PyTuple_New(len); @@ -978,64 +228,6 @@ static inline PyObject* MakeReturnPyObject(const std::tuple& out) { return result; } -void InitOpsAttrTypeMap() { - auto op_info_map = paddle::framework::OpInfoMap::Instance().map(); - for (auto iter = op_info_map.begin(); iter != op_info_map.end(); ++iter) { - auto op_proto = iter->second.proto_; - if (op_proto == nullptr) { - continue; - } - auto attrs_proto = op_proto->attrs(); - for (auto& attr : attrs_proto) { - OpAttrTypeMap::Instance().Map()[iter->first][attr.name()] = attr.type(); - } - } -} - -void ThrowExceptionToPython(std::exception_ptr p) { - static PyObject* EOFExceptionException = - PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); - static PyObject* EnforceNotMetException = - PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); - try { - if (p) std::rethrow_exception(p); - } catch (const platform::EOFException& e) { - PyErr_SetString(EOFExceptionException, e.what()); - } catch (const platform::EnforceNotMet& e) { - switch (e.code()) { - case paddle::platform::error::INVALID_ARGUMENT: - PyErr_SetString(PyExc_ValueError, e.what()); - break; - case paddle::platform::error::NOT_FOUND: - case paddle::platform::error::ALREADY_EXISTS: - case paddle::platform::error::PRECONDITION_NOT_MET: - case paddle::platform::error::PERMISSION_DENIED: - case paddle::platform::error::EXECUTION_TIMEOUT: - case paddle::platform::error::UNAVAILABLE: - PyErr_SetString(PyExc_RuntimeError, e.what()); - break; - case paddle::platform::error::OUT_OF_RANGE: - PyErr_SetString(PyExc_IndexError, e.what()); - break; - case paddle::platform::error::RESOURCE_EXHAUSTED: - PyErr_SetString(PyExc_MemoryError, e.what()); - break; - case 
paddle::platform::error::UNIMPLEMENTED: - PyErr_SetString(PyExc_NotImplementedError, e.what()); - break; - case paddle::platform::error::FATAL: - PyErr_SetString(PyExc_SystemError, e.what()); - break; - case paddle::platform::error::EXTERNAL: - PyErr_SetString(PyExc_OSError, e.what()); - break; - default: - PyErr_SetString(EnforceNotMetException, e.what()); - break; - } - } -} - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc new file mode 100644 index 0000000000000..1f761ae29c2af --- /dev/null +++ b/paddle/fluid/pybind/op_function_common.cc @@ -0,0 +1,806 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/op_function_common.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +class OpAttrTypeMap { + public: + static OpAttrTypeMap& Instance() { + static OpAttrTypeMap g_op_attr_type_map; + return g_op_attr_type_map; + } + + std::unordered_map< + std::string, + std::unordered_map>& + Map() { + return ops_attrtype_map_; + } + + private: + OpAttrTypeMap() = default; + std::unordered_map< + std::string, + std::unordered_map> + ops_attrtype_map_; +}; + +extern PyTypeObject* g_varbase_pytype; +extern PyTypeObject* g_vartype_pytype; +extern PyTypeObject* g_blockdesc_pytype; + +bool PyObject_CheckBool(PyObject** obj) { return PyBool_Check(*obj); } + +bool PyObject_CheckLongOrToLong(PyObject** obj) { + if ((PyLong_Check(*obj) && !PyBool_Check(*obj)) || + PyObject_IsInstance(*obj, (PyObject*)g_vartype_pytype) || // NOLINT + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + return true; + } + + if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT + .find("numpy") != std::string::npos) { + auto to = PyNumber_Long(*obj); + if (to) { + *obj = to; + return true; + } + } + + return false; +} + +bool PyObject_CheckFloatOrToFloat(PyObject** obj) { + // sometimes users provide PyLong or numpy.int64 but attr is float + if (PyFloat_Check(*obj) || PyLong_Check(*obj) || + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + return true; + } + if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT + .find("numpy") != std::string::npos) { + auto to = PyNumber_Float(*obj); + if (to) { + *obj = to; + return true; + } + } + return false; +} + +bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } + +void CastPyArg2AttrBoolean(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const 
std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + attrs[key] = false; // To be compatible with QA integration testing. Some + // test case pass in None. + } else if (obj == Py_True) { + attrs[key] = true; + } else if (obj == Py_False) { + attrs[key] = false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrInt(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckLongOrToLong(&obj)) { + attrs[key] = (int)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "int, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrLong(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckLongOrToLong(&obj)) { + attrs[key] = (int64_t)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "long, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrFloat(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckFloatOrToFloat(&obj)) { + attrs[key] = (float)PyFloat_AsDouble(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "float, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrString(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckString(obj)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(obj, &size); + attrs[key] = std::string(data, (size_t)size); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "str, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrBooleans(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckBool(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of bool, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckBool(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of 
bool, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrInts(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrLongs(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + 
((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrFloats(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrFloat64s(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT 
+ i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrStrings(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckString(item)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(item, &size); + value.emplace_back(std::string(data, (size_t)size)); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of str, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckString(item)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(item, &size); + value.emplace_back(std::string(data, (size_t)size)); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of str, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrBlock(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)obj; // NOLINT + + if (!PyObject_IsInstance((PyObject*)inst, // NOLINT + (PyObject*)g_blockdesc_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "BlockDesc, but got %s", + op_type, arg_pos + 1, + 
((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + attrs[key] = reinterpret_cast(vh[0]); +} + +void ConstructAttrMapFromPyArgs( + const std::string& op_type, PyObject* args, ssize_t attr_start, + ssize_t attr_end, paddle::framework::AttributeMap& attrs) { // NOLINT + PADDLE_ENFORCE_EQ( + (attr_end - attr_start) % 2, 0, + platform::errors::InvalidArgument( + "The number of arguments for attributes should be even.")); + + auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]); + + PyObject* obj = nullptr; + for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + Py_ssize_t key_len; + const char* key_ptr; + obj = PyTuple_GET_ITEM(args, arg_pos); + if (PyObject_CheckString(obj)) { + key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be str, but got " + "%s", + op_type, arg_pos, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + std::string key(key_ptr, (size_t)key_len); // NOLINT + auto iter = attr_type_map->find(key); + if (iter == attr_type_map->end()) { + continue; + } + + obj = PyTuple_GET_ITEM(args, arg_pos + 1); + + switch (iter->second) { + case paddle::framework::proto::AttrType::INT: + CastPyArg2AttrInt(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOAT: + CastPyArg2AttrFloat(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::STRING: + CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::INTS: + CastPyArg2AttrInts(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOATS: + CastPyArg2AttrFloats(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::STRINGS: + CastPyArg2AttrStrings(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BOOLEAN: + CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BOOLEANS: + CastPyArg2AttrBooleans(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::LONG: + CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::LONGS: + CastPyArg2AttrLongs(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOAT64S: + CastPyArg2AttrFloat64s(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BLOCK: + CastPyArg2AttrBlock(obj, attrs, key, op_type, arg_pos); + break; + default: + break; + } + } +} + +std::shared_ptr GetVarBaseFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check((PyObject*)inst)) { // NOLINT + inst = (::pybind11::detail::instance*)PyTuple_GET_ITEM(inst, 0); + } + + if (inst == nullptr || (PyObject*)inst == Py_None) { // NOLINT + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + return nullptr; + } + + if (!PyObject_IsInstance((PyObject*)inst, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + 
"%s(): argument '%s' (position %d) must be Tensor, but got " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)inst)->ob_type)->tp_name)); // NOLINT + } + + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + return reinterpret_cast&>(vh[1]); +} + +std::vector> GetVarBaseListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensor, but got " + "None", + op_type, arg_name, arg_idx)); // NOLINT + } + return {}; + } + + std::vector> result; + + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + ::pybind11::detail::instance* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = (::pybind11::detail::instance*)PyList_GetItem(list, i); + if (!PyObject_IsInstance((PyObject*)item, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but " + "got list of " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT + } + void** vh = item->simple_layout ? item->simple_value_holder + : &item->nonsimple.values_and_holders[0]; + result.emplace_back( + reinterpret_cast&>( + vh[1])); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + ::pybind11::detail::instance* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = (::pybind11::detail::instance*)PyTuple_GetItem(list, i); // NOLINT + if (!PyObject_IsInstance((PyObject*)item, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but " + "got list of " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT + } + void** vh = item->simple_layout ? 
item->simple_value_holder + : &item->nonsimple.values_and_holders[0]; + result.emplace_back( + reinterpret_cast&>( + vh[1])); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)list->ob_type)->tp_name)); // NOLINT + } + + return result; +} + +unsigned long GetUnsignedLongFromArgs( // NOLINT + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* item = PyTuple_GET_ITEM(args, arg_idx); + + if (item == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be long, but got None", + op_type, arg_name, arg_idx)); + } + return 0; + } + + if (PyObject_CheckLongOrToLong(&item)) { + return PyLong_AsUnsignedLong(item); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be " + "long, but got %s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)item->ob_type)->tp_name)); // NOLINT + } +} + +void InitOpsAttrTypeMap() { + auto op_info_map = paddle::framework::OpInfoMap::Instance().map(); + for (auto iter = op_info_map.begin(); iter != op_info_map.end(); ++iter) { + auto op_proto = iter->second.proto_; + if (op_proto == nullptr) { + continue; + } + auto attrs_proto = op_proto->attrs(); + for (auto& attr : attrs_proto) { + OpAttrTypeMap::Instance().Map()[iter->first][attr.name()] = attr.type(); + } + } +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h new file mode 100644 index 0000000000000..9dc3a71a6ccf9 --- /dev/null +++ b/paddle/fluid/pybind/op_function_common.h @@ -0,0 +1,126 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
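To make the relationship between these pieces concrete, here is a rough, hypothetical sketch of how an auto-generated imperative op binding is expected to use the helpers defined in op_function_common.cc: parse the positional Tensor arguments with GetVarBaseFromArgs, collect the trailing flattened (name, value) attribute pairs with ConstructAttrMapFromPyArgs, and turn any C++ error into a Python exception. The op name "my_relu" and the argument layout are illustrative only, not the real generated code.

  // Assumes it lives in namespace paddle::pybind, like the generated code.
  static PyObject* imperative_my_relu(PyObject* self, PyObject* args,
                                      PyObject* kwargs) {
    try {
      // Positional slot 0: the (non-dispensable) input Tensor "X".
      auto x = GetVarBaseFromArgs("my_relu", "X", args, 0, false);
      // The remaining positional slots hold attribute name/value pairs.
      framework::AttributeMap attrs;
      ConstructAttrMapFromPyArgs("my_relu", args, 1, PyTuple_GET_SIZE(args),
                                 attrs);
      // ... trace the op with {x} and attrs, then wrap the outputs via
      // MakeReturnPyObject(...) in the real generated code ...
      Py_RETURN_NONE;  // placeholder return for this sketch
    } catch (...) {
      ThrowExceptionToPython(std::current_exception());
      return nullptr;
    }
  }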
+ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/imperative.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +bool PyObject_CheckBool(PyObject** obj); + +bool PyObject_CheckLongOrToLong(PyObject** obj); + +bool PyObject_CheckFloatOrToFloat(PyObject** obj); + +bool PyObject_CheckString(PyObject* obj); + +void CastPyArg2AttrBoolean(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrInt(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrLong(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrFloat(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrString(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrBooleans(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrInts(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrLongs(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrFloats(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrFloat64s(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrStrings(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrBlock(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void ConstructAttrMapFromPyArgs( + const std::string& op_type, PyObject* args, ssize_t attr_start, + ssize_t attr_end, + paddle::framework::AttributeMap& attrs); // NOLINT + +std::shared_ptr GetVarBaseFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +std::vector> GetVarBaseListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +unsigned long GetUnsignedLongFromArgs( // NOLINT + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +void InitOpsAttrTypeMap(); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc 
b/paddle/fluid/pybind/op_function_generator.cc index 850f208359e05..5587952facc53 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pybind/op_function_generator.h" + #include #include #include @@ -30,179 +32,6 @@ #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif -// NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are -// determined by the OP`s proto automatically, i.e., all the inputs registered -// in OpMaker. -// However, some OPs have dispensable inputs, which means the input can -// be none for some conditions. It is discovered that most dispensable inputs -// is not used in imperative mode, so we drop those inputs when generating OP -// functions. While, for very few OPs, the dispensable inputs are used, we -// need to manually specify them in this map. -std::map> op_ins_map = { - {"layer_norm", {"X", "Scale", "Bias"}}, - {"bincount", {"X", "Weights"}}, - {"fused_attention", - {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", - "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, - {"instance_norm", {"X", "Scale", "Bias"}}, - {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, - {"label_smooth", {"X", "PriorDist"}}, - {"assign", {"X"}}, - {"reshape2", {"X", "Shape"}}, - {"expand", {"X", "ExpandTimes"}}, - {"slice", {"Input", "StartsTensor", "EndsTensor"}}, - {"fake_quantize_dequantize_moving_average_abs_max", - {"X", "InScale", "InAccum", "InState"}}, - {"nll_loss", {"X", "Label", "Weight"}}, - {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, - {"gather", {"X", "Index", "Axis"}}, - {"roi_pool", {"X", "ROIs", "RoisNum"}}, - {"roi_align", {"X", "ROIs", "RoisNum"}}, - {"psroi_pool", {"X", "ROIs", "RoisNum"}}, - {"collect_fpn_proposals", - {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, - {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, - {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, - {"hierarchical_sigmoid", - {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, - {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, - {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, - {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, - {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, - {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, - {"run_program", {"X", "Params"}}, - {"fused_feedforward", - {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", - "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, - {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, - {"matrix_rank", {"X", "TolTensor"}}, - {"adam", - {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", - "Beta2Pow", "MasterParam"}}, - {"adamw", - {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", - "Beta2Pow", "MasterParam"}}, -}; - -// NOTE(zhiqiu): Like op_ins_map. -// Commonly, the outputs in auto-generated OP function are determined by the -// OP`s proto automatically, i.e., all the outputs registered in OpMaker. -// However, some OPs have dispensable outputs, which means the output can -// be none for some conditions. It is discovered that most dispensable outputs -// is not used in imperative mode, so we drop those outputs when generating OP -// functions. 
While, for very few OPs, the dispensable outputs are used, we -// need to manually specify them in this map. -std::map> op_outs_map = { - {"fake_quantize_dequantize_moving_average_abs_max", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"batch_norm", - {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", - "ReserveSpace"}}, - {"fused_attention", - {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", - "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut", - "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean", - "Ln2Variance", "BiasDropoutResidualOut", "Y"}}, - {"sync_batch_norm", - {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", - "ReserveSpace"}}, - {"unique", {"Out", "Index", "Indices", "Counts"}}, - {"unique_consecutive", {"Out", "Index", "Counts"}}, - {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, - {"matrix_nms", {"Out", "Index", "RoisNum"}}, - {"distribute_fpn_proposals", - {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, - {"moving_average_abs_max_scale", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"multiclass_nms3", {"Out", "NmsRoisNum"}}, - {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, - {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, - {"lamb", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, - {"run_program", {"DOut"}}, - {"adam", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, -}; - -// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are -// generated in C++ automatically. -// However, some OPs need to pass the outputs from Python instead of generating -// them in C++. There are mainly 2 reasons for that, -// (1) Optimizer OPs need to update the input param in-place, like sgd. -// So they need to pass the output which is same as input param. -// (2) Very few python APIs has out in their arguments, like fill_constant. -// So they need to pass the python output to C++. -// Actually, this is not a good design, since it may break the SSA graph, -// especially in declarative mode. -// For those OPs, we need to manually specify the outs need to pass in this map. 
-std::map> op_passing_outs_map = { - {"sgd", {"ParamOut"}}, - {"adam", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"average_accumulates", - {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", - "out_old_num_accumulates", "out_num_updates"}}, - {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, - {"batch_norm", {"MeanOut", "VarianceOut"}}, - {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, - {"accuracy", {"Correct", "Total"}}, - {"fill_constant", {"Out"}}, - {"recv_v2", {"Out"}}, - {"partial_recv", {"Out"}}, - {"matmul", {"Out"}}, - {"c_broadcast", {"Out"}}, - {"c_sync_calc_stream", {"Out"}}, - {"c_sync_comm_stream", {"Out"}}, - {"c_reduce_sum", {"Out"}}, - {"c_reduce_max", {"Out"}}, - {"c_reduce_min", {"Out"}}, - {"c_reduce_prod", {"Out"}}, - {"c_reduce", {"Out"}}, - {"c_scatter", {"Out"}}, - {"barrier", {"Out"}}, - {"fake_quantize_dequantize_moving_average_abs_max", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, - {"update_loss_scaling", - {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, - {"moving_average_abs_max_scale", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"lamb", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, - {"rnn", {"DropoutState"}}, - {"run_program", {"Out", "DOut", "OutScope"}}, - {"clear_float_status", {"FloatStatusOut"}}, - {"get_float_status", {"FloatStatusOut"}}, -}; - -// NOTE(pangyoki): Tensor View Strategy. -// In this case, a new output varbase will be created, and this varbase will -// reuse the input varbase's allocation. -// It's a map. The key of outer map is the view op name, the value is -// a pair which implies the mapping relationship between the input and -// output varbase. -std::map> view_op_map = { - {"squeeze2", {"X", "Out"}}, // "X" -> "Out" - {"unsqueeze2", {"X", "Out"}}, - {"reshape2", {"X", "Out"}}, - {"flatten_contiguous_range", {"X", "Out"}}, -}; - // NOTE(pangyoki): Inplace OP with duplicable input. // The set includes inplace ops that have duplicable input. // The first Varbase in input needs to be specified for the inplace strategy diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h new file mode 100644 index 0000000000000..7000097e0abcb --- /dev/null +++ b/paddle/fluid/pybind/op_function_generator.h @@ -0,0 +1,192 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
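As a rough illustration of the Tensor View Strategy note for view_op_map above, the fragment below shows how a generator could consult the map when emitting code for a view op; the actual sharing mechanism is only described in a comment and is an assumption here.

    // Illustrative fragment, not generator output.
    auto view_it = view_op_map.find(op_type);        // e.g. op_type == "squeeze2"
    if (view_it != view_op_map.end()) {
      const std::string& view_in = view_it->second.first;    // "X"
      const std::string& view_out = view_it->second.second;  // "Out"
      // Emit code so that the output VarBase named `view_out` reuses the
      // allocation of the input VarBase named `view_in`, instead of getting a
      // freshly allocated buffer.
    }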
+ +#pragma once + +#include +#include +#include + +// NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are +// determined by the OP`s proto automatically, i.e., all the inputs registered +// in OpMaker. +// However, some OPs have dispensable inputs, which means the input can +// be none for some conditions. It is discovered that most dispensable inputs +// is not used in imperative mode, so we drop those inputs when generating OP +// functions. While, for very few OPs, the dispensable inputs are used, we +// need to manually specify them in this map. +std::map> op_ins_map = { + {"layer_norm", {"X", "Scale", "Bias"}}, + {"bincount", {"X", "Weights"}}, + {"fused_attention", + {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", + "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + {"instance_norm", {"X", "Scale", "Bias"}}, + {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, + {"label_smooth", {"X", "PriorDist"}}, + {"assign", {"X"}}, + {"reshape2", {"X", "Shape"}}, + {"expand", {"X", "ExpandTimes"}}, + {"slice", {"Input", "StartsTensor", "EndsTensor"}}, + {"fake_quantize_dequantize_moving_average_abs_max", + {"X", "InScale", "InAccum", "InState"}}, + {"nll_loss", {"X", "Label", "Weight"}}, + {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, + {"gather", {"X", "Index", "Axis"}}, + {"roi_pool", {"X", "ROIs", "RoisNum"}}, + {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"psroi_pool", {"X", "ROIs", "RoisNum"}}, + {"collect_fpn_proposals", + {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, + {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, + {"hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, + {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, + {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, + {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, + {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, + {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, + {"run_program", {"X", "Params"}}, + {"fused_feedforward", + {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", + "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, + {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, + {"matrix_rank", {"X", "TolTensor"}}, + {"adam", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, + {"adamw", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, +}; + +// NOTE(zhiqiu): Like op_ins_map. +// Commonly, the outputs in auto-generated OP function are determined by the +// OP`s proto automatically, i.e., all the outputs registered in OpMaker. +// However, some OPs have dispensable outputs, which means the output can +// be none for some conditions. It is discovered that most dispensable outputs +// is not used in imperative mode, so we drop those outputs when generating OP +// functions. While, for very few OPs, the dispensable outputs are used, we +// need to manually specify them in this map. 
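To make the intent of op_ins_map concrete, here is a hedged sketch of the lookup a generator can perform when deciding which inputs to expose; GetOpProtoInputs is a hypothetical helper standing in for reading the op's proto, and `in.name()` / `in.dispensable()` mirror the OpProto accessors.

    // Sketch under stated assumptions; needs <algorithm>, <string>, <vector>.
    std::vector<std::string> exposed_inputs;
    for (const auto& in : GetOpProtoInputs(op_type)) {  // hypothetical helper
      bool keep = !in.dispensable();                    // default: drop dispensable
      auto it = op_ins_map.find(op_type);
      if (it != op_ins_map.end()) {
        // A curated entry overrides the default rule for this op.
        keep = std::find(it->second.begin(), it->second.end(), in.name()) !=
               it->second.end();
      }
      if (keep) exposed_inputs.push_back(in.name());
    }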
+std::map> op_outs_map = { + {"fake_quantize_dequantize_moving_average_abs_max", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, + {"fused_attention", + {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", + "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut", + "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean", + "Ln2Variance", "BiasDropoutResidualOut", "Y"}}, + {"sync_batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, + {"unique", {"Out", "Index", "Indices", "Counts"}}, + {"unique_consecutive", {"Out", "Index", "Counts"}}, + {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"matrix_nms", {"Out", "Index", "RoisNum"}}, + {"distribute_fpn_proposals", + {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"multiclass_nms3", {"Out", "NmsRoisNum"}}, + {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, + {"lamb", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"run_program", {"DOut"}}, + {"adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, +}; + +// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are +// generated in C++ automatically. +// However, some OPs need to pass the outputs from Python instead of generating +// them in C++. There are mainly 2 reasons for that, +// (1) Optimizer OPs need to update the input param in-place, like sgd. +// So they need to pass the output which is same as input param. +// (2) Very few python APIs has out in their arguments, like fill_constant. +// So they need to pass the python output to C++. +// Actually, this is not a good design, since it may break the SSA graph, +// especially in declarative mode. +// For those OPs, we need to manually specify the outs need to pass in this map. 
+std::map> op_passing_outs_map = { + {"sgd", {"ParamOut"}}, + {"adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"average_accumulates", + {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", + "out_old_num_accumulates", "out_num_updates"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"batch_norm", {"MeanOut", "VarianceOut"}}, + {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, + {"accuracy", {"Correct", "Total"}}, + {"fill_constant", {"Out"}}, + {"recv_v2", {"Out"}}, + {"partial_recv", {"Out"}}, + {"matmul", {"Out"}}, + {"c_broadcast", {"Out"}}, + {"c_sync_calc_stream", {"Out"}}, + {"c_sync_comm_stream", {"Out"}}, + {"c_reduce_sum", {"Out"}}, + {"c_reduce_max", {"Out"}}, + {"c_reduce_min", {"Out"}}, + {"c_reduce_prod", {"Out"}}, + {"c_reduce", {"Out"}}, + {"c_scatter", {"Out"}}, + {"barrier", {"Out"}}, + {"fake_quantize_dequantize_moving_average_abs_max", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, + {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}}, + {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, + {"update_loss_scaling", + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"lamb", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"rnn", {"DropoutState"}}, + {"run_program", {"Out", "DOut", "OutScope"}}, + {"clear_float_status", {"FloatStatusOut"}}, + {"get_float_status", {"FloatStatusOut"}}, +}; + +// NOTE(pangyoki): Tensor View Strategy. +// In this case, a new output varbase will be created, and this varbase will +// reuse the input varbase's allocation. +// It's a map. The key of outer map is the view op name, the value is +// a pair which implies the mapping relationship between the input and +// output varbase. +std::map> view_op_map = { + {"squeeze2", {"X", "Out"}}, // "X" -> "Out" + {"unsqueeze2", {"X", "Out"}}, + {"reshape2", {"X", "Out"}}, + {"flatten_contiguous_range", {"X", "Out"}}, +}; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 617c724f63f38..1e29820e08d5c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -75,6 +75,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/io.h" #include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND @@ -113,9 +114,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif #ifndef PADDLE_WITH_HIP -#include "paddle/fluid/platform/cuda_profiler.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -150,6 +151,14 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { + +PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_cudaplace_pytype = nullptr; +PyTypeObject *g_cpuplace_pytype = nullptr; +PyTypeObject *g_xpuplace_pytype = nullptr; +PyTypeObject *g_npuplace_pytype = nullptr; +PyTypeObject *g_cudapinnedplace_pytype = nullptr; + bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; @@ -198,6 +207,14 @@ bool IsCompiledWithMKLDNN() { #endif } +bool IsCompiledWithCINN() { +#ifndef PADDLE_WITH_CINN + return false; +#else + return true; +#endif +} + bool IsCompiledWithHETERPS() { #ifndef PADDLE_WITH_HETERPS return false; @@ -498,7 +515,7 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { static int GetNCCLVersion() { #if NCCL_VERSION_CODE >= 2304 int ver; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); return ver; #else PADDLE_THROW(platform::errors::External( @@ -524,6 +541,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif + BindEager(&m); BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. @@ -546,7 +564,7 @@ PYBIND11_MODULE(core_noavx, m) { m.def("disable_signal_handler", &DisableSignalHandler); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("cudnn_version", &platform::CudnnVersion); + m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; size_t total = 0; @@ -554,6 +572,7 @@ PYBIND11_MODULE(core_noavx, m) { return available; }); #endif + #ifdef PADDLE_WITH_NCCL m.def("nccl_version", &GetNCCLVersion); #endif @@ -1611,7 +1630,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif - py::class_(m, "CUDAPlace", R"DOC( + py::class_ cudaplace(m, "CUDAPlace", R"DOC( CUDAPlace is a descriptor of a device. It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1634,7 +1653,9 @@ All parameter, weight, gradient are variables in Paddle. place = paddle.CUDAPlace(0) - )DOC") + )DOC"); + g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); + cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1646,8 +1667,8 @@ All parameter, weight, gradient are variables in Paddle. std::exit(-1); } - if (UNLIKELY(dev_id >= platform::GetCUDADeviceCount())) { - if (platform::GetCUDADeviceCount() == 0) { + if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { + if (platform::GetGPUDeviceCount() == 0) { LOG(ERROR) << "Cannot use GPU because there is no GPU " "detected on your " "machine."; @@ -1656,8 +1677,8 @@ All parameter, weight, gradient are variables in Paddle. 
LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " "number on your machine is %d", - dev_id, platform::GetCUDADeviceCount(), - platform::GetCUDADeviceCount()); + dev_id, platform::GetGPUDeviceCount(), + platform::GetGPUDeviceCount()); std::exit(-1); } } @@ -1692,13 +1713,15 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); - py::class_(m, "XPUPlace", R"DOC( + py::class_ xpuplace(m, "XPUPlace", R"DOC( **Note**: Examples: .. code-block:: python import paddle.fluid as fluid xpu_place = fluid.XPUPlace(0) - )DOC") + )DOC"); + g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); + xpuplace .def("__init__", [](platform::XPUPlace &self, int dev_id) { #ifdef PADDLE_WITH_XPU @@ -1768,7 +1791,7 @@ All parameter, weight, gradient are variables in Paddle. }); #endif - py::class_(m, "CPUPlace", R"DOC( + py::class_ cpuplace(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device on which a tensor will be allocated and a model will run. @@ -1778,8 +1801,9 @@ All parameter, weight, gradient are variables in Paddle. import paddle cpu_place = paddle.CPUPlace() - )DOC") - .def(py::init<>()) + )DOC"); + g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); + cpuplace.def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) @@ -1791,7 +1815,8 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); - py::class_(m, "CUDAPinnedPlace", R"DOC( + py::class_ cudapinnedplace( + m, "CUDAPinnedPlace", R"DOC( CUDAPinnedPlace is a descriptor of a device. It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. The host operating system will not paging and exchanging the memory. @@ -1805,7 +1830,10 @@ All parameter, weight, gradient are variables in Paddle. import paddle place = paddle.CUDAPinnedPlace() - )DOC") + )DOC"); + g_cudapinnedplace_pytype = + reinterpret_cast(cudapinnedplace.ptr()); + cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -1831,7 +1859,7 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); // NPUPlace - py::class_(m, "NPUPlace", R"DOC( + py::class_ npuplace(m, "NPUPlace", R"DOC( NPUPlace is a descriptor of a device. It represents a NPU device on which a tensor will be allocated and a model will run. @@ -1840,7 +1868,9 @@ All parameter, weight, gradient are variables in Paddle. import paddle npu_place = paddle.NPUPlace(0) - )DOC") + )DOC"); + g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); + npuplace .def("__init__", [](platform::NPUPlace &self, int dev_id) { #ifdef PADDLE_WITH_ASCEND_CL @@ -1891,8 +1921,9 @@ All parameter, weight, gradient are variables in Paddle. [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) .def("__str__", string::to_string); - py::class_(m, "Place") - .def(py::init<>()) + py::class_ platformplace(m, "Place"); + g_place_pytype = reinterpret_cast(platformplace.ptr()); + platformplace.def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) @@ -2180,6 +2211,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); + m.def("is_compiled_with_cinn", IsCompiledWithCINN); m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS); m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); @@ -2229,7 +2261,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 - return platform::GetCUDAComputeCapability(place.device) >= 53; + return platform::GetGPUComputeCapability(place.device) >= 53; }); #endif @@ -2409,7 +2441,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("cuda_empty_cache", [] { for (int dev_id : platform::GetSelectedDevices()) { auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt new file mode 100644 index 0000000000000..b8f6f4738d3e7 --- /dev/null +++ b/paddle/infrt/CMakeLists.txt @@ -0,0 +1,79 @@ +if (NOT WITH_INFRT) + return() +endif() + +set(infrt_src CACHE INTERNAL "" FORCE) + +# Gather headers for library publish. +function(core_gather_headers) + file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + + foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") + endforeach() +endfunction() + +function(gather_srcs SRC_GROUP) + set(options) + set(oneValueArgs) + set(multiValueArgs "SRCS") + cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN}) + foreach(cpp ${prefix_SRCS}) + set(${SRC_GROUP} "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" CACHE INTERNAL "") + endforeach() +endfunction() + +# This method is similar to the global cc_test, but discard the huge amount default dependencies those are +# not needed by INFRT. 
+function(cc_test_tiny TARGET_NAME) + if(WITH_TESTING) + set(options SERIAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_tiny_SRCS}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS} ${os_dependency_modules} infrt_gtest_main gtest ) + add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest extern_gtest) + + add_test(NAME ${TARGET_NAME} + COMMAND ${TARGET_NAME} "${cc_test_tiny_ARGS}" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (${cc_test_tiny_SERIAL}) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + endif() + endif() + +endfunction() + +if (WITH_TESTING) + cc_library(infrt_gtest_main SRCS gtest_main.cc DEPS gtest glog gflags) +endif() + + +add_subdirectory(api) +add_subdirectory(common) +add_subdirectory(dialect) +add_subdirectory(host_context) +add_subdirectory(kernel) +add_subdirectory(tensor) +add_subdirectory(support) +add_subdirectory(external_kernels) +add_subdirectory(paddle) + + +# MLIR td file generations +set(infrt_mlir_incs + ops_inc + basic_kernels_inc + test_kernels_inc + infrt_base_inc + tensor_shape_inc + dense_tensor_inc + pd_ops_inc + rewrite_inc + ) +message(STATUS "infrt srcs:\n${infrt_src}") + +cc_library(infrt SRCS ${infrt_src} DEPS glog ${mlir_libs} paddle_framework_proto) +add_dependencies(infrt ${infrt_mlir_incs}) diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt new file mode 100644 index 0000000000000..93a7ae8369521 --- /dev/null +++ b/paddle/infrt/api/CMakeLists.txt @@ -0,0 +1,8 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_api.cc + ) + +# Disable temporarily for the external-kernel's mkldnn is outdate +# cc_test(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc new file mode 100644 index 0000000000000..c2a4e0aff7a08 --- /dev/null +++ b/paddle/infrt/api/infrt_api.cc @@ -0,0 +1,246 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/api/infrt_api.h" + +#include +#include +#include +#include + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/value.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" +#include "paddle/infrt/tensor/tensor_map.h" + +using namespace infrt::host_context; // NOLINT +using namespace infrt::tensor; // NOLINT +using namespace infrt::tensor; // NOLINT +using infrt::dt::TensorMapType; // NOLINT +using infrt::dt::TensorType; // NOLINT + +namespace infrt { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +struct MlirToRuntimeTranslator::Impl { + mlir::ModuleOp module; + // The runtime for a function call. + CoreRuntimeBuilder* runtime{}; + + // The current working op, the translator process the ops one by one, each + // time it updates `cur_op` here to current op + // working on. + OpExecutableBuilder* cur_op{}; + + // record the current function name. + std::string cur_func_name; + + // Name to function definitions. + std::unordered_map func_defs; + + // Map from an operation to its results. + std::unordered_map> op_results; + llvm::DenseMap value_map; +}; + +/** + * Execute the mlir program in predict mode. + */ +class PredictExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder core_runtime; + + PredictExecutor(mlir::ModuleOp module, + KernelRegistry* registry, + TensorMap* map) + : MlirToRuntimeTranslator(module, &core_runtime), + core_runtime(registry), + registry_(registry) { + CHECK(registry_); + Init(map); + } + + void Run() { + auto arguments = llvm::makeArrayRef(arguments_); + auto results = llvm::makeMutableArrayRef(results_.begin(), results_.size()); + function_executable_->Execute(arguments, results); + } + + int GetInputNum() { return inputs_.size(); } + + DenseHostTensor* GetInput(int i) { return inputs_[i]; } + + int GetOutputNum() { return outputs_.size(); } + + DenseHostTensor* GetOutput(int i) { return outputs_[i]; } + + private: + void Init(TensorMap* map) { + EmitFunctions(); + llvm::Optional predict_func_ = llvm::None; + for (auto func_op : impl_->module.getOps()) { + if (func_op.getName().str() != "predict") continue; + predict_func_ = func_op; + break; + } + if (!predict_func_) { + std::cout << "ERROR: init failed, no predict function found in mlir." 
+ << std::endl; + return; + } + auto& predict_func = predict_func_.getValue(); + function_executable_ = + new MlirFunctionExecutable(predict_func, registry_, impl_->func_defs); + + // process parammeters + for (size_t i = 0; i < predict_func.getNumArguments(); ++i) { + auto arg = predict_func.getArgument(i); + auto type = arg.getType(); + // this param is TensorMap + if (type.isa()) { + auto* value = new host_context::Value(std::move(*map)); + arguments_.push_back(value); + AddValue(predict_func.getArgument(i), value); + } else { + // this param is an input Tensor + auto dht = DenseHostTensor(); + auto* value = new host_context::Value(std::move(dht)); + arguments_.push_back(value); + inputs_.push_back(&(value->get())); + } + } + + // process results + auto& last_op = predict_func.front().back(); + if (last_op.getName().getStringRef() == "infrt.return") { + for (size_t i = 0; i < last_op.getNumOperands(); ++i) { + auto* value = AddValue(mlir::Value(last_op.getOperand(i))); + results_.push_back(ValueRef(value)); + outputs_.push_back(&(value->get())); + } + } + } + + protected: + std::unordered_map func_def_table; + + void EmitFunction(mlir::FuncOp op) override { + CHECK(!impl_->func_defs.count(op.getName().str())) + << "Duplicate function defition found for function [" + << op.getName().str(); + impl_->func_defs.emplace(op.getName().str(), op); + } + + private: + KernelRegistry* registry_{}; + MlirFunctionExecutable* function_executable_; + llvm::SmallVector inputs_; + llvm::SmallVector arguments_; + llvm::SmallVector outputs_; + llvm::SmallVector results_; +}; + +std::shared_ptr CreateInfRtPredictor( + const InfRtConfig& config) { + auto x = std::make_shared(); + x->Init(config); + return x; +} + +struct InfRtPredictor::Impl { + mlir::OwningModuleRef module_ref; + std::unique_ptr executor; +}; + +InfRtPredictor::InfRtPredictor() : impl_(new Impl) {} +InfRtPredictor::~InfRtPredictor() {} + +void InfRtPredictor::Run() { impl_->executor->Run(); } + +int InfRtPredictor::Init(const InfRtConfig& config) { + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module_ref = dialect::LoadMlirFile(config.mlir_path(), context); + + KernelRegistry* registry = new KernelRegistry(); + + kernel::RegisterBasicKernels(registry); + kernel::RegisterTestKernels(registry); + kernel::RegisterTensorShapeKernels(registry); + kernel::RegisterTensorKernels(registry); + kernel::RegisterControlFlowKernels(registry); + + impl_->module_ref = std::move(module_ref); + + // load extra shared library + for (const std::string& lib_path : config.shared_libs()) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + return 1; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = reinterpret_cast(reg_sym); + reg_func(registry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". 
Skip.\n"; + } + } + + // Load params + TensorMap* tensor_map = LoadParams(config.model_dir()); + + // Create PredictExecutor + impl_->executor.reset( + new PredictExecutor(impl_->module_ref.get(), registry, tensor_map)); + return 0; +} + +int InfRtPredictor::GetInputNum() { return impl_->executor->GetInputNum(); } + +DenseHostTensor* InfRtPredictor::GetInput(int i) { + return impl_->executor->GetInput(i); +} + +int InfRtPredictor::GetOutputNum() { return impl_->executor->GetOutputNum(); } + +DenseHostTensor* InfRtPredictor::GetOutput(int i) { + return impl_->executor->GetOutput(i); +} + +} // namespace infrt diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h new file mode 100644 index 0000000000000..82b6cb8df91ff --- /dev/null +++ b/paddle/infrt/api/infrt_api.h @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/infrt/tensor/dense_host_tensor.h" + +namespace infrt { + +class InfRtConfig { + std::string model_dir_; + std::string mlir_path_; + std::vector shared_libs_; + + public: + InfRtConfig() = default; + void set_model_dir(const std::string& model_dir) { model_dir_ = model_dir; } + const std::string& model_dir() const { return model_dir_; } + + void set_mlir_path(const std::string& mlir_path) { mlir_path_ = mlir_path; } + const std::string& mlir_path() const { return mlir_path_; } + + void set_shared_libs(const std::vector& shared_libs) { + shared_libs_ = shared_libs; + } + const std::vector& shared_libs() const { return shared_libs_; } + + virtual ~InfRtConfig() = default; +}; + +class InfRtPredictor { + public: + InfRtPredictor(); + ~InfRtPredictor(); + void Run(); + int Init(const InfRtConfig& config); + int GetInputNum(); + tensor::DenseHostTensor* GetInput(int i); + int GetOutputNum(); + tensor::DenseHostTensor* GetOutput(int i); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +std::shared_ptr CreateInfRtPredictor(const InfRtConfig& config); + +} // namespace infrt diff --git a/paddle/infrt/api/infrt_api_test.cc b/paddle/infrt/api/infrt_api_test.cc new file mode 100644 index 0000000000000..92e069f47521b --- /dev/null +++ b/paddle/infrt/api/infrt_api_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/api/infrt_api.h" + +#include + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "paddle/infrt/common/buffer.h" +#include "paddle/infrt/common/dtype.h" + +using infrt::InfRtConfig; +using infrt::InfRtPredictor; +using infrt::CreateInfRtPredictor; + +namespace infrt { + +TEST(InfRtPredictor, predictor) { + std::vector shared_libs; + shared_libs.push_back("../../paddle/libexternal_kernels.so"); + + InfRtConfig config; + + // set external shared libraries that contain kernels. + config.set_shared_libs(shared_libs); + // set model dir + config.set_model_dir("../../paddle/paddle_1.8_fc_model"); + // set mlir path + config.set_mlir_path("../../../infrt/dialect/mlir_tests/tensor_map.mlir"); + + std::shared_ptr predictor = CreateInfRtPredictor(config); + + auto* input = predictor->GetInput(0); + std::vector shape = {3, 3}; + input->Init(shape, infrt::GetDType()); + llvm::outs() << input->shape() << "\n"; + + // init input tensor + auto* input_data = reinterpret_cast(input->buffer()->data()->memory); + for (int i = 0; i < input->shape().GetNumElements(); i++) input_data[i] = 1.0; + + predictor->Run(); + + // get and print output tensor + auto* output = predictor->GetOutput(0); + auto* output_data = + reinterpret_cast(output->buffer()->data()->memory); + + std::vector ans = {0.428458, + 0.244493, + 0.572342, + 0.572008, + 0.509771, + 0.495599, + 0.651287, + 0.326426, + 0.404649}; + + ASSERT_EQ(output->shape().GetNumElements(), ans.size()); + for (int i = 0; i < output->shape().GetNumElements(); ++i) { + ASSERT_NEAR(output_data[i], ans[i], 0.000001); + } +} + +} // namespace infrt diff --git a/paddle/infrt/common/CMakeLists.txt b/paddle/infrt/common/CMakeLists.txt new file mode 100644 index 0000000000000..931e3e42307eb --- /dev/null +++ b/paddle/infrt/common/CMakeLists.txt @@ -0,0 +1,14 @@ +core_gather_headers() +set(core_includes "${core_includes};infrt/common/dtype.def" CACHE INTERNAL "") + +gather_srcs(infrt_src SRCS + dtype.cc + global.cc + target.cc + type.cc + shared.cc + object.cc + string.cc + buffer.cc + memory.cc + ) diff --git a/paddle/infrt/common/buffer.cc b/paddle/infrt/common/buffer.cc new file mode 100644 index 0000000000000..bc4ec7feada87 --- /dev/null +++ b/paddle/infrt/common/buffer.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/common/buffer.h" + +#include +#include + +#include + +namespace infrt { +void Buffer::Resize(uint32_t size) { + if (size_ > 0) { + Free(); + size_ = 0; + } + + if (size_ != size) { + data_.memory = reinterpret_cast(Malloc(size)); + size_ = size; + } +} + +void Buffer::Resize(uint32_t alignment, uint32_t size) { + if (size_ > 0) { + Free(); + size_ = 0; + } + + if (size_ != size) { + data_.memory = reinterpret_cast(AlignedAlloc(alignment, size)); + size_ = size; + } +} + +void Buffer::SetTarget(const infrt::common::Target& target) { + target_ = target; + memory_mng_cache_ = MemoryManager::Global().RetrieveSafely(target_.arch); +} + +void Buffer::ResizeLazy(uint32_t size) { + if (size <= size_) return; + Resize(size); +} + +void Buffer::ResizeLazy(uint32_t alignment, uint32_t size) { + if (size <= size_) return; + Resize(alignment, size); +} + +void Buffer::Resize(uint32_t size, const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + Resize(size); +} + +void Buffer::Resize(uint32_t alignment, + uint32_t size, + const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + Resize(alignment, size); +} + +void Buffer::ResizeLazy(uint32_t size, const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + ResizeLazy(size); +} + +void Buffer::ResizeLazy(uint32_t alignment, + uint32_t size, + const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + ResizeLazy(alignment, size); +} + +} // namespace infrt diff --git a/paddle/infrt/common/buffer.h b/paddle/infrt/common/buffer.h new file mode 100644 index 0000000000000..cae2a7ead96ab --- /dev/null +++ b/paddle/infrt/common/buffer.h @@ -0,0 +1,296 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/memory.h" +#include "paddle/infrt/common/target.h" + +namespace infrt { + +#ifdef __cplusplus +extern "C" { +#endif + +#define INFRT_ALWAYS_INLINE __attribute__((always_inline)) inline + +//! Code for the primitive types supported in INFRT. +typedef enum infrt_type_code_t { + infrt_type_unk = -1, //! Unknown type + infrt_type_int = 0, //! signed int + infrt_type_uint = 1, //! unsigned int + infrt_type_float = 2, //! floating point + infrt_type_handle = 3 //! void* +} infrt_type_code_t; + +#ifndef INFRT_ATTRIBUTE_ALIGN +#define INFRT_ATTRIBUTE_ALIGN(n) __attribute__((aligned(n))) +#endif + +/** + * A tuntime tag for type in INFRT system. + */ +typedef struct infrt_type_t { +#if __cplusplus >= 201103L + INFRT_ATTRIBUTE_ALIGN(1) infrt_type_code_t code; +#else + uint8_t code; +#endif + + //! Number of bits. + uint8_t bits; + + //! Number of elements in a vector, 1 for scalar. + uint16_t lanes; + + //! Number of '*', e.g. 
for `float*`, the num_asterisks is 1, `float**` it is + //! 2. + uint8_t num_asterisks{0}; + +#ifdef __cplusplus + INFRT_ALWAYS_INLINE infrt_type_t() + : code(infrt_type_int), bits(0), lanes(0) {} + INFRT_ALWAYS_INLINE infrt_type_t(infrt_type_code_t code, + uint8_t bits, + uint16_t lanes = 1, + uint8_t num_asterisks = 0) + : code(code), bits(bits), lanes(lanes), num_asterisks(num_asterisks) {} + INFRT_ALWAYS_INLINE bool operator==(const infrt_type_t& other) const { + return code == other.code && bits == other.bits && lanes == other.lanes; + } + INFRT_ALWAYS_INLINE bool operator!=(const infrt_type_t& other) const { + return !(*this == other); + } + INFRT_ALWAYS_INLINE uint16_t bytes() const { return (bits + 7) / 8; } +#endif // __cplusplus +} infrt_type_t; + +//! Help to define the size of a dimension, due to polyhedral representation, we +//! no need to record the extend or +//! min(default to 0). +typedef int infrt_dimension_t; + +//! Help to tell the kind of the device. +typedef enum infrt_device_kind_t { + infrt_unk_device = -1, // Undefined device. + infrt_x86_device = 0, // X86 device + infrt_opencl_device = 1, // OpenCL device + infrt_arm_device = 2 // ARM device +} infrt_device_kind_t; + +struct infrt_buffer_t; + +/** + * All INFRT backends implementation should provide an interface to be used. + */ +struct infrt_device_interface_impl_t; + +struct infrt_device_interface_t { + int (*malloc)(void* context, struct infrt_buffer_t* buf); + int (*free)(void* context, struct infrt_buffer_t* buf); + int (*sync)(void* context, struct infrt_buffer_t* buf); + int (*release)(void* context, + const struct infrt_device_interface_t* device_interface); + int (*copy_to_host)(void* context, struct infrt_buffer_t* buf); + int (*copy_to_device)(void* context, struct infrt_buffer_t* buf); + int (*buffer_copy)(void* context, + struct infrt_buffer_t* src, + struct infrt_buffer_t* dst); + struct infrt_device_interface_impl_t* impl; +}; + +//! The raw representation of a buffer,used in the generated code/lib. +#define INFRT_BUFFER_MAX_DIMS 8 +typedef struct infrt_buffer_t { + //! Tell which kind of device this buffer locates. + infrt_device_kind_t device; + + //! The interface used to operate on device. + const struct infrt_device_interface_t* device_interface; + + //! A pointer to the memory in host. + uint8_t* memory; + + //! Extra flags. + uint64_t flag; + + //! Data type. + infrt_type_t type; + + //! Number of dimensions. + int32_t dimensions; + infrt_dimension_t dims[INFRT_BUFFER_MAX_DIMS]; + + //! Allocate and deallocate lazily, default true. + char lazy; + + //! The actual memory size(in bytes). + uint64_t memory_size; + + uint16_t align; + +#ifdef __cplusplus + infrt_buffer_t() + : device(infrt_unk_device), + device_interface(NULL), + memory(NULL), + flag(0UL), + type(infrt_type_t()), + dimensions(0), + lazy(true), + memory_size(0), + align(0) {} + + static void delete_(struct infrt_buffer_t* x) { delete x; } + + ~infrt_buffer_t() {} + + // NOTE the buffer should be resized first. + static void alloc(struct infrt_buffer_t*); + + //! Set the shape of the buffer. NOTE this just record the shape, not allocate + //! the memory. 
+ INFRT_ALWAYS_INLINE void resize(const infrt_dimension_t* dims, + int dimensions) { + this->dimensions = dimensions; + memcpy(this->dims, dims, dimensions * sizeof(infrt_dimension_t)); + } + + INFRT_ALWAYS_INLINE uint64_t num_elements() const { + uint64_t res = 1; + for (int i = 0; i < dimensions; i++) { + res *= dims[i]; + } + return res; + } + + INFRT_ALWAYS_INLINE int device_sync(void* ctx = NULL) { + if (device_interface && device_interface->sync) { + return device_interface->sync(ctx, this); + } + return 0; + } + + INFRT_ALWAYS_INLINE uint8_t* begin() const { return 0; } + INFRT_ALWAYS_INLINE uint8_t* end() const { + return memory + num_elements() * type.bytes(); + } + +#endif // __cplusplus +} infrt_buffer_t; + +#ifdef __cplusplus +struct infrt_device_interface_impl_t { + int (*malloc)(void* context, struct infrt_buffer_t* buf); + int (*free)(void* context, struct infrt_buffer_t* buf); + int (*sync)(void* context, struct infrt_buffer_t* buf); + int (*release)(void* context); + int (*copy_to_host)(void* context, struct infrt_buffer_t* buf); + int (*copy_to_device)(void* context, struct infrt_buffer_t* buf); + int (*buffer_copy)(void* context, + struct infrt_buffer_t* src, + struct infrt_buffer_t* dst); +}; + +// The device implementations +extern struct infrt_device_interface_t* infrt_x86_device_interface(); +#endif // __cplusplus + +#ifdef __cplusplus +} // extern "C" +#endif + +#define INFRT_LOG(fmt, ...) \ + do { \ + fprintf(stderr, \ + "%s:%d:%s(): " fmt, \ + __FILE__, \ + __LINE__, \ + __func__, \ + __VA_ARGS__); \ + } while (0) + +#define INFRT_CHECK(cond) \ + if (!(cond)) { \ + INFRT_LOG("check %s failed", #cond); \ + abort(); \ + } +/** + * Buffer helps to hold the memory, and offers a set of methods to help manage + * the memory. + */ +struct Buffer final { + Buffer() = default; + explicit Buffer(const common::Target& target) { SetTarget(target); } + + //! Resize the memory hold by this buffer *exactlly* to \p size. + void Resize(uint32_t size); + void Resize(uint32_t alignment, uint32_t size); + + //! Lazily resize the memory. + void ResizeLazy(uint32_t size); + void ResizeLazy(uint32_t alignment, uint32_t size); + + //! Resize the memory to \p size in target \p target. + void Resize(uint32_t size, const common::Target& target); + void Resize(uint32_t alignment, uint32_t size, const common::Target& target); + + //! Lazily resize the memory to \p size in target \p target. + void ResizeLazy(uint32_t size, const common::Target& target); + void ResizeLazy(uint32_t alignment, + uint32_t size, + const common::Target& target); + + void SetTarget(const common::Target& target); + + const infrt_buffer_t* data() const { return &data_; } + infrt_buffer_t* data() { return &data_; } + + //! Free all the memory owned by this buffer. + void Free() { + if (!data_.memory) return; + memory_mng_cache_->free(data_.memory); + } + + private: + inline void* Malloc(uint32_t size) INFRT_RESULT_SHOULD_USE { + CHECK(memory_mng_cache_) << "Should set target first"; + return memory_mng_cache_->malloc(size); + } + + inline void* AlignedAlloc(uint32_t alignment, + uint32_t size) INFRT_RESULT_SHOULD_USE { + CHECK(memory_mng_cache_) << "Should set target first"; + return memory_mng_cache_->aligned_alloc(alignment, size); + } + + private: + infrt_buffer_t data_; + + //! The place where this buffer locates. + common::Target target_; + + //! Number of bytes of this buffer. + uint32_t size_{}; + + //! Hold the corresponding memory manager for speed. 
+ MemoryInterface* memory_mng_cache_{}; +}; + +} // namespace infrt diff --git a/paddle/infrt/common/common.h b/paddle/infrt/common/common.h new file mode 100644 index 0000000000000..a15bc69b6030a --- /dev/null +++ b/paddle/infrt/common/common.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/shared.h" +#include "paddle/infrt/common/target.h" +#include "paddle/infrt/common/type.h" + +namespace infrt { + +// export some general concepts. +using common::make_shared; +using common::Object; +using common::ref_count; +using common::Shared; + +// Type related. +using common::Bool; +using common::Float; +using common::Int; +using common::UInt; +using common::Void; + +using common::type_of; + +using common::Target; +using common::Type; +using common::UnkTarget; + +template +T& Reference(const T* x) { + return *const_cast(x); +} + +static void CheckVarNameValid(const std::string& name) { + CHECK(!name.empty()); + CHECK(name.find(' ') == std::string::npos && // + name.find('.') == std::string::npos && // + name.find('/') == std::string::npos && // + name.find('\t') == std::string::npos && // + name.find('\n') == std::string::npos && // + name.find('\r') == std::string::npos) + << "Some invalid character found"; +} + +} // namespace infrt diff --git a/paddle/infrt/common/dtype.cc b/paddle/infrt/common/dtype.cc new file mode 100644 index 0000000000000..d5cf67d8a3c40 --- /dev/null +++ b/paddle/infrt/common/dtype.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
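A small usage sketch for the C-level descriptor infrt_buffer_t defined in buffer.h above; it only exercises the inline helpers (resize, num_elements, bytes) and does not touch device memory.

    #include <cstdio>
    // plus "paddle/infrt/common/buffer.h" for infrt_buffer_t / infrt_type_t

    void BufferDescriptorSketch() {
      infrt_buffer_t buf;
      buf.type = infrt_type_t(infrt_type_float, /*bits=*/32);  // float32 elements
      infrt_dimension_t dims[] = {3, 4};
      buf.resize(dims, 2);                         // records the shape only
      uint64_t elems = buf.num_elements();         // 3 * 4 = 12
      uint64_t nbytes = elems * buf.type.bytes();  // 12 * 4 = 48
      std::printf("%llu elements, %llu bytes\n",
                  static_cast<unsigned long long>(elems),
                  static_cast<unsigned long long>(nbytes));
    }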
+ +#include "paddle/infrt/common/dtype.h" + +namespace infrt { + +const char* DType::name() const { + switch (kind_) { + case Kind::Unk: + return "Unk"; + break; +#define INFRT_DTYPE(enum__, value__) \ + case Kind::enum__: \ + return #enum__; \ + break; +#include "paddle/infrt/common/dtype.def" +#undef INFRT_DTYPE + } + + return ""; +} + +size_t DType::GetHostSize() const { + switch (kind_) { +#define INFRT_DTYPE(enum__, value__) \ + case DType::Kind::enum__: \ + return sizeof(DTypeInternal::type); +#include "paddle/infrt/common/dtype.def" // NOLINT +#undef INFRT_DTYPE + + case Kind::Unk: + return 0; + break; + } + return 0; +} + +} // namespace infrt diff --git a/paddle/infrt/common/dtype.def b/paddle/infrt/common/dtype.def new file mode 100644 index 0000000000000..32df72aa764a3 --- /dev/null +++ b/paddle/infrt/common/dtype.def @@ -0,0 +1,18 @@ +// Define all INFRT dtypes +// DTYPE(ENUM, VALUE) +#ifdef INFRT_DTYPE + +INFRT_DTYPE(UI8, 1) +INFRT_DTYPE(UI16, 2) +INFRT_DTYPE(UI32, 3) +INFRT_DTYPE(UI64, 4) +INFRT_DTYPE(I1, 5) +INFRT_DTYPE(I8, 6) +INFRT_DTYPE(I16, 7) +INFRT_DTYPE(I32, 8) +INFRT_DTYPE(I64, 9) +INFRT_DTYPE(F32, 10) +INFRT_DTYPE(F64, 11) +INFRT_DTYPE(STRING, 12) + +#endif \ No newline at end of file diff --git a/paddle/infrt/common/dtype.h b/paddle/infrt/common/dtype.h new file mode 100644 index 0000000000000..8b57299fa94fd --- /dev/null +++ b/paddle/infrt/common/dtype.h @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include + +namespace infrt { +class DType { + public: + enum class Kind : uint8_t { + Unk = 0, + +// Automatically generate the enum definition +#define INFRT_DTYPE(enum__, value__) enum__ = value__, +#include "paddle/infrt/common/dtype.def" +#undef INFRT_DTYPE + + BOOL = I1, + }; + + DType() = default; + explicit constexpr DType(Kind kind) : kind_(kind) { assert(IsValid()); } + + DType(const DType&) = default; + DType& operator=(const DType&) = default; + bool operator==(DType other) const { return kind_ == other.kind_; } + bool operator!=(DType other) const { return !(*this == other); } + + constexpr Kind kind() const { return kind_; } + + bool IsValid() const { return kind_ != Kind::Unk; } + bool IsInvalid() const { return !IsValid(); } + + const char* name() const; + + size_t GetHostSize() const; + + private: + Kind kind_{Kind::Unk}; +}; + +template +constexpr DType GetDType(); + +template +struct DTypeInternal; + +#define INFRT_IMPL_GET_DTYPE(cpp_type__, enum__) \ + template <> \ + inline constexpr DType GetDType() { \ + return DType{DType::Kind::enum__}; \ + } \ + template <> \ + struct DTypeInternal { \ + using type = cpp_type__; \ + }; + +INFRT_IMPL_GET_DTYPE(bool, I1); +INFRT_IMPL_GET_DTYPE(int8_t, I8); +INFRT_IMPL_GET_DTYPE(int16_t, I16); +INFRT_IMPL_GET_DTYPE(int32_t, I32); +INFRT_IMPL_GET_DTYPE(int64_t, I64); +INFRT_IMPL_GET_DTYPE(uint8_t, UI8); +INFRT_IMPL_GET_DTYPE(uint16_t, UI16); +INFRT_IMPL_GET_DTYPE(uint32_t, UI32); +INFRT_IMPL_GET_DTYPE(uint64_t, UI64); +INFRT_IMPL_GET_DTYPE(float, F32); +INFRT_IMPL_GET_DTYPE(double, F64); +INFRT_IMPL_GET_DTYPE(std::string, STRING); + +} // namespace infrt diff --git a/paddle/infrt/common/global.cc b/paddle/infrt/common/global.cc new file mode 100644 index 0000000000000..54ecf1589aa14 --- /dev/null +++ b/paddle/infrt/common/global.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/common/global.h" + +namespace infrt { + +Global::Global() {} + +mlir::MLIRContext* Global::context = nullptr; + +mlir::MLIRContext* Global::getMLIRContext() { + if (nullptr == context) { + context = new mlir::MLIRContext(); + } + return context; +} + +} // namespace infrt diff --git a/paddle/pten/api/include/linalg.h b/paddle/infrt/common/global.h similarity index 62% rename from paddle/pten/api/include/linalg.h rename to paddle/infrt/common/global.h index 259cf66493203..f89164d03f31d 100644 --- a/paddle/pten/api/include/linalg.h +++ b/paddle/infrt/common/global.h @@ -14,17 +14,19 @@ #pragma once -#include "paddle/pten/api/include/tensor.h" +#include "mlir/IR/MLIRContext.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" -namespace paddle { -namespace experimental { +namespace infrt { -PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y); +// global variables +class Global { + private: + static mlir::MLIRContext *context; + Global(); -PD_DLL_DECL Tensor matmul(const Tensor& x, - const Tensor& y, - bool transpose_x = false, - bool transpose_y = false); + public: + static mlir::MLIRContext *getMLIRContext(); +}; // class Global -} // namespace experimental -} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/common/macros.h b/paddle/infrt/common/macros.h new file mode 100644 index 0000000000000..4481f6b38aed3 --- /dev/null +++ b/paddle/infrt/common/macros.h @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(NDEBUG) +#define INFRT_DEBUG +#endif + +#define INFRT_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +#ifndef INFRT_NOT_IMPLEMENTED +#define INFRT_NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented"; +#endif + +#define INFRT_RESULT_SHOULD_USE __attribute__((warn_unused_result)) + +/** + * A trick to enforce the registry. + * + * usage: + * + * INFRT_REGISTER_HELPER(some_key) { + * // register methods + * } + * + * INFRT_USE_REGISTER(some_key); + */ +#define INFRT_REGISTER_HELPER(symbol__) bool __infrt__##symbol__##__registrar() +#define INFRT_USE_REGISTER(symbol__) \ + extern bool __infrt__##symbol__##__registrar(); \ + [[maybe_unused]] static bool __infrt_extern_registrar_##symbol__ = \ + __infrt__##symbol__##__registrar(); + +#if __cplusplus >= 201703L +#define INFRT_NODISCARD [[nodiscard]] +#else +#define INFRT_NODISCARD +#endif diff --git a/paddle/infrt/common/memory.cc b/paddle/infrt/common/memory.cc new file mode 100644 index 0000000000000..aa5983a56c434 --- /dev/null +++ b/paddle/infrt/common/memory.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/memory.h"
+
+namespace infrt {
+
+using infrt::common::Target;
+
+namespace {
+
+class X86MemoryMng : public MemoryInterface {
+ public:
+  void* malloc(size_t nbytes) override { return ::malloc(nbytes); }
+  void free(void* data) override {
+    if (!data) return;
+    ::free(data);
+  }
+  void* aligned_alloc(size_t alignment, size_t nbytes) override {
+    return ::aligned_alloc(alignment, nbytes);
+  }
+};
+
+}  // namespace
+
+MemoryManager::MemoryManager() {
+  Register(Target::Arch::Unk, new X86MemoryMng);
+  Register(Target::Arch::X86, new X86MemoryMng);
+}
+
+}  // namespace infrt
diff --git a/paddle/infrt/common/memory.h b/paddle/infrt/common/memory.h
new file mode 100644
index 0000000000000..678529b8b785c
--- /dev/null
+++ b/paddle/infrt/common/memory.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include <glog/logging.h>
+
+#include "paddle/infrt/common/macros.h"
+#include "paddle/infrt/common/target.h"
+
+namespace infrt {
+
+class MemoryInterface {
+ public:
+  virtual void* malloc(size_t nbytes) = 0;
+  virtual void free(void* data) = 0;
+  virtual void* aligned_alloc(size_t alignment, size_t nbytes) {
+    return nullptr;
+  }
+  virtual ~MemoryInterface() {}
+};
+
+/**
+ * MemoryManager holds a map of MemoryInterface for each architecture.
+ */
+class MemoryManager final {
+ public:
+  using key_t = common::Target::Arch;
+
+  static MemoryManager& Global() {
+    static auto* x = new MemoryManager;
+    return *x;
+  }
+
+  MemoryInterface* Retrieve(key_t key) INFRT_RESULT_SHOULD_USE {
+    auto it = memory_mngs_.find(key);
+    if (it != memory_mngs_.end()) return it->second.get();
+    return nullptr;
+  }
+
+  MemoryInterface* RetrieveSafely(key_t key) {
+    auto* res = Retrieve(key);
+    CHECK(res) << "no MemoryInterface for architecture [" << key << "]";
+    return res;
+  }
+
+  MemoryInterface* Register(key_t key, MemoryInterface* item) {
+    CHECK(!memory_mngs_.count(key)) << "Duplicate register [" << key << "]";
+    memory_mngs_[key].reset(item);
+    return item;
+  }
+
+ private:
+  MemoryManager();
+
+  std::unordered_map<key_t, std::unique_ptr<MemoryInterface>> memory_mngs_;
+
+  INFRT_DISALLOW_COPY_AND_ASSIGN(MemoryManager);
+};
+
+}  // namespace infrt
diff --git a/paddle/infrt/common/object.cc b/paddle/infrt/common/object.cc
new file mode 100644
index 0000000000000..6842ff7ba007d
--- /dev/null
+++ b/paddle/infrt/common/object.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/object.h"
+
+namespace infrt {
+namespace common {}  // namespace common
+}  // namespace infrt
diff --git a/paddle/infrt/common/object.h b/paddle/infrt/common/object.h
new file mode 100644
index 0000000000000..ab2d00cce985c
--- /dev/null
+++ b/paddle/infrt/common/object.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstring>
+#include <string>
+
+#include "paddle/infrt/common/shared.h"
+
+namespace infrt {
+namespace common {
+
+template <typename T>
+class Shared;
+/**
+ * Object is the basic element in the INFRT. With the `Shared` wrapper, an
+ * object can be shared across the system.
+ */
+struct Object {
+  //! Get the type representation of this object.
+  virtual const char* type_info() const = 0;
+  virtual ~Object() {}
+
+  //! Cast to a derived type.
+  template <typename T>
+  T* as() {
+    return static_cast<T*>(this);
+  }
+
+  //! Cast to a derived type.
+  template <typename T>
+  const T* as() const {
+    return static_cast<const T*>(this);
+  }
+
+  //! Type safe cast.
+  template <typename T>
+  T* safe_as() {
+    if (std::strcmp(type_info(), T::__type_info__) == 0) {
+      return static_cast<T*>(this);
+    }
+    return nullptr;
+  }
+  //! Type safe cast.
+  template <typename T>
+  const T* safe_as() const {
+    if (std::strcmp(type_info(), T::__type_info__) == 0) {
+      return static_cast<const T*>(this);
+    }
+    return nullptr;
+  }
+
+  //! Check if the type is right.
+  template <typename T>
+  bool is_type() const {
+    if (std::strcmp(type_info(), T::__type_info__) == 0) {
+      return true;
+    }
+    return false;
+  }
+
+  //! The reference count, which makes all the derived types shareable.
+  mutable RefCount __ref_count__;
+};
+
+using object_ptr = Object*;
+using shared_object = Shared<Object>;
+
+}  // namespace common
+}  // namespace infrt
diff --git a/paddle/infrt/common/shared.cc b/paddle/infrt/common/shared.cc
new file mode 100644
index 0000000000000..78457b7ed352b
--- /dev/null
+++ b/paddle/infrt/common/shared.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/shared.h"
diff --git a/paddle/infrt/common/shared.h b/paddle/infrt/common/shared.h
new file mode 100644
index 0000000000000..dbcf2b0597888
--- /dev/null
+++ b/paddle/infrt/common/shared.h
@@ -0,0 +1,153 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <atomic>
+#include <string>
+#include <type_traits>
+
+namespace infrt {
+namespace common {
+
+class RefCount {
+ public:
+  using value_type = int32_t;
+  RefCount() = default;
+
+  value_type Inc() { return ++count_; }
+  value_type Dec() { return --count_; }
+  bool is_zero() const { return 0 == count_; }
+  std::string to_string() { return std::to_string(count_.load()); }
+  int32_t val() const { return count_; }
+
+ private:
+  std::atomic<value_type> count_{0};
+};
+
+class Object;
+/**
+ * The templated methods are used to unify the way to get the RefCount instance
+ * in client classes.
+ */
+template <typename T>
+RefCount& ref_count(const T* t) {
+  static_assert(std::is_base_of<Object, T>::value, "T is not an Object");
+  return t->__ref_count__;
+}
+template <typename T>
+void Destroy(const T* t) {
+  delete t;
+}
+
+template <typename T>
+struct Shared {
+  using object_ptr = T*;
+
+  Shared() = default;
+  explicit Shared(T* p) : p_(p) {
+    if (p) IncRef(p);
+  }
+  Shared(const Shared& other) : p_(other.p_) { IncRef(p_); }
+  Shared(Shared&& other) : p_(other.p_) { other.p_ = nullptr; }
+  Shared& operator=(const Shared& other);
+
+  //! Reset to another pointer \p x.
+  void Reset(T* x = nullptr);
+
+  //! Access the pointer in various ways.
+  // @{
+  inline T* get() const { return p_; }
+  inline T& operator*() const { return *p_; }
+  inline T* operator->() const { return p_; }
+  inline T* self() { return p_; }
+  inline const T* self() const { return p_; }
+  // @}
+
+  inline bool same_as(const Shared& other) { return p_ == other.p_; }
+  inline bool defined() const { return p_; }
+  inline bool operator<(const Shared& other) const { return p_ < other.p_; }
+  inline Shared& operator=(T* x);
+  inline bool operator==(const Shared& other) const { return p_ == other.p_; }
+
+  ~Shared();
+
+ private:
+  //! Increase the share count.
+  void IncRef(T* p);
+
+  //! Decrease the share count.
+  void DecRef(T* p);
+
+ protected:
+  T* p_{};
+};
+
+template <typename T>
+void Shared<T>::IncRef(T* p) {
+  if (p) {
+    ref_count(p).Inc();
+  }
+}
+template <typename T>
+void Shared<T>::DecRef(T* p) {
+  if (p) {
+    if (ref_count(p).Dec() == 0) {
+      Destroy(p);
+    }
+  }
+}
+template <typename T>
+Shared<T>& Shared<T>::operator=(const Shared<T>& other) {
+  if (other.p_ == p_) return *this;
+  // Other can be inside of something owned by this, so we should be careful to
+  // incref other before we decref ourselves.
+  T* tmp = other.p_;
+  IncRef(tmp);
+  DecRef(p_);
+  p_ = tmp;
+  return *this;
+}
+
+template <typename T, typename... Args>
+T* make_shared(Args&&... args) {
+  return new T(args...);
+}
+
+template <typename T>
+Shared<T>& Shared<T>::operator=(T* x) {
+  if (p_ == x) return *this;
+
+  T* tmp = x;
+  IncRef(tmp);
+  DecRef(p_);
+  p_ = tmp;
+  return *this;
+}
+
+template <typename T>
+Shared<T>::~Shared() {
+  DecRef(p_);
+  p_ = nullptr;
+}
+
+template <typename T>
+void Shared<T>::Reset(T* x) {
+  if (x) IncRef(x);
+  DecRef(p_);
+  p_ = x;
+}
+
+}  // namespace common
+}  // namespace infrt
diff --git a/paddle/infrt/common/string.cc b/paddle/infrt/common/string.cc
new file mode 100644
index 0000000000000..d02643825a7c8
--- /dev/null
+++ b/paddle/infrt/common/string.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/string.h"
+
+#include <stdarg.h>
+
+#include <cstring>
+
+namespace infrt {
+namespace infrt {
+
+std::string StringFormat(const std::string &fmt_str, ...)
{ + /* Reserve two times as much as the length of the fmt_str */ + int final_n, n = (static_cast(fmt_str.size())) * 2; + std::unique_ptr formatted; + va_list ap; + while (1) { + formatted.reset( + new char[n]); /* Wrap the plain char array into the unique_ptr */ + std::strcpy(&formatted[0], fmt_str.c_str()); // NOLINT + va_start(ap, fmt_str); + final_n = vsnprintf(&formatted[0], n, fmt_str.c_str(), ap); + va_end(ap); + if (final_n < 0 || final_n >= n) + n += abs(final_n - n + 1); + else + break; + } + return std::string(formatted.get()); +} + +std::string Trim(const std::string &s, const char *empty) { + if (s.empty()) return s; + auto start = s.find_first_not_of(empty); + if (start == std::string::npos) return ""; + auto end = s.find_last_not_of(empty); + return s.substr(start, end - start + 1); +} + +std::string Uppercase(const std::string &x) { + auto res = x; + for (auto &c : res) { + c = toupper(c); + } + return res; +} + +bool Startswith(const std::string &x, const std::string &str) { + return x.find(str) == 0; +} +bool Endswith(const std::string &x, const std::string &str) { + if (x.length() >= str.length()) { + return std::equal(str.rbegin(), str.rend(), x.rbegin()); + } + return false; +} + +std::vector Split(const std::string &str, + const std::string &splitter) { + std::vector results; + std::string::size_type pos1, pos2; + pos2 = str.find(splitter); + pos1 = 0; + while (std::string::npos != pos2) { + results.push_back(str.substr(pos1, pos2 - pos1)); + pos1 = pos2 + splitter.size(); + pos2 = str.find(splitter, pos1); + } + if (pos1 != str.length()) { + results.push_back(str.substr(pos1)); + } + return results; +} + +void Replace(std::string *s, const std::string &from, const std::string &to) { + size_t pos = 0; + while ((pos = s->find(from, pos)) != std::string::npos) { + s->replace(pos, from.size(), to); + pos += to.length(); + } +} + +size_t Count(std::string *s, const std::string &sub) { + size_t pos = 0; + size_t times = 0; + while ((pos = s->find(sub, pos)) != std::string::npos) { + if ((pos == 0 || !IsPrefix(s->at(pos - 1))) && + (pos + sub.length() == s->size() || + !IsSuffix(s->at(pos + sub.length())))) { + pos += sub.length(); + times++; + } else { + pos++; + } + } + return times; +} + +bool IsPrefix(const char &c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_'); +} + +bool IsSuffix(const char &c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_') || + (c >= '0' && c <= '9') || (c == '\''); +} + +std::string TransValidVarName(std::string name) { + Replace(&name, ".", "__"); + Replace(&name, "/", "___"); + name.erase(0, name.find_first_not_of("_")); + return name; +} + +} // namespace infrt +} // namespace infrt diff --git a/paddle/infrt/common/string.h b/paddle/infrt/common/string.h new file mode 100644 index 0000000000000..f744470603f80 --- /dev/null +++ b/paddle/infrt/common/string.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace infrt {
+namespace infrt {
+
+//! Get the content of a stream.
+template <typename T>
+std::string GetStreamCnt(const T& x);
+
+/**
+ * Construct a formatted string with arguments.
+ * @param fmt_str The format.
+ * @param ... The parameters of the format.
+ * @return The formatted string.
+ */
+std::string StringFormat(const std::string& fmt_str, ...);
+
+/**
+ * Join multiple fields into a single string. Similar to Python's str.join method.
+ */
+template <typename T>
+std::string Join(const std::vector<T>& fields, const std::string& splitter) {
+  if (fields.empty()) return "";
+  std::stringstream ss;
+  for (int i = 0; i < fields.size() - 1; i++) ss << fields[i] << splitter;
+  ss << fields.back();
+  return ss.str();
+}
+
+std::vector<std::string> Split(const std::string& str,
+                               const std::string& splitter);
+
+std::string Trim(const std::string& s, const char* empty = " \n\r\t");
+
+//! Convert a string to its uppercase.
+std::string Uppercase(const std::string& x);
+
+//! Replace a substr 'from' to 'to' in string s.
+void Replace(std::string* s, const std::string& from, const std::string& to);
+
+//! Count how many times substr 'sub' appears in string s.
+size_t Count(std::string* s, const std::string& sub);
+
+//! Tell if a char is a prefix of a tensor's name.
+bool IsPrefix(const char& c);
+
+//! Tell if a char is a suffix of a tensor's name.
+bool IsSuffix(const char& c);
+
+//! Tell if a string \p x starts with \p str.
+bool Startswith(const std::string& x, const std::string& str);
+
+//! Tell if a string \p x ends with \p str.
+bool Endswith(const std::string& x, const std::string& str);
+
+template <typename T>
+std::string GetStreamCnt(const T& x) {
+  std::stringstream os;
+  os << x;
+  return os.str();
+}
+
+std::string TransValidVarName(std::string name);
+
+}  // namespace infrt
+}  // namespace infrt
diff --git a/paddle/infrt/common/target.cc b/paddle/infrt/common/target.cc
new file mode 100644
index 0000000000000..d376ad7db0241
--- /dev/null
+++ b/paddle/infrt/common/target.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/target.h"
+
+#include <glog/logging.h>
+
+namespace infrt {
+namespace common {
+
+bool Target::operator==(const Target &other) const {
+  return os == other.os &&      //
+         arch == other.arch &&  //
+         bits == other.bits &&  //
+         features == other.features;
+}
+
+int Target::max_num_threads() const {
+  CHECK(arch == Arch::NVGPU)
+      << "The target is not NVGPU!
Cannot get max number of threads."; + return 1024; +} + +std::vector Target::get_target_libs() const { return libs; } + +int Target::get_target_bits() const { + switch (bits) { + case Bit::k32: + return 32; + case Bit::k64: + return 64; + case Bit::Unk: + return 0; + default: + LOG(FATAL) << "Not supported Bit"; + } + return -1; +} + +std::ostream &operator<<(std::ostream &os, const Target &target) { + os << "Target<"; + switch (target.os) { + case Target::OS::Linux: + os << "linux"; + break; + case Target::OS::Windows: + os << "windows"; + break; + case Target::OS::Unk: + os << "unk"; + break; + } + + os << ","; + + switch (target.arch) { + case Target::Arch::X86: + os << "x86"; + break; + case Target::Arch::ARM: + os << "arm"; + break; + case Target::Arch::NVGPU: + os << "nvgpu"; + break; + case Target::Arch::Unk: + os << "unk"; + break; + } + os << ","; + + switch (target.bits) { + case Target::Bit::k32: + os << "32"; + break; + case Target::Bit::k64: + os << "64"; + break; + case Target::Bit::Unk: + os << "unk"; + break; + } + os << ">"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, Target::Arch arch) { + switch (arch) { + case Target::Arch::Unk: + os << "Unk"; + break; + case Target::Arch::X86: + os << "X86"; + break; + case Target::Arch::ARM: + os << "ARM"; + break; + case Target::Arch::NVGPU: + os << "NVGPU"; + break; + } + return os; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/target.h b/paddle/infrt/common/target.h new file mode 100644 index 0000000000000..eaf19efbfe7a8 --- /dev/null +++ b/paddle/infrt/common/target.h @@ -0,0 +1,112 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace infrt { +namespace common { + +struct Target { + /** + * The operating system used by the target. Determines which system calls to + * generate. + */ + enum class OS : int { + Unk = -1, + Linux, + Windows, + }; + + /** + * The architecture used by the target. Determines the instruction set to use. + */ + enum class Arch : int { + Unk = -1, + X86, + ARM, + NVGPU, + }; + + enum class Bit : int { + Unk = -1, + k32, + k64, + }; + + OS os{OS::Unk}; + Arch arch{Arch::Unk}; + Bit bits{Bit::Unk}; + + enum class Feature : int { + JIT = 0, + Debug, + }; + + /** + * The library used by the target. 
+ */ + enum class Lib : int { + Unk = -1, + MKL, + }; + std::vector features; + std::vector libs; + + explicit Target(OS o = OS::Linux, + Arch a = Arch::Unk, + Bit b = Bit::Unk, + const std::vector& features = {}, + const std::vector& libs = {}) + : os(o), arch(a), bits(b), features(features), libs(libs) {} + + bool defined() const { + return os != OS::Unk && arch != Arch::Unk && bits != Bit::Unk; + } + + int max_num_threads() const; + + int get_target_bits() const; + + std::vector get_target_libs() const; + + bool operator==(const Target& other) const; + bool operator!=(const Target& other) const { return !(*this == other); } + friend std::ostream& operator<<(std::ostream& os, const Target& target); +}; + +static const Target& UnkTarget() { + static Target target( + Target::OS::Unk, Target::Arch::Unk, Target::Bit::Unk, {}, {}); + return target; +} + +static const Target& DefaultHostTarget() { + static Target target( + Target::OS::Linux, Target::Arch::X86, Target::Bit::k64, {}, {}); + return target; +} + +static const Target& DefaultNVGPUTarget() { + static Target target( + Target::OS::Linux, Target::Arch::NVGPU, Target::Bit::k64, {}, {}); + return target; +} + +std::ostream& operator<<(std::ostream& os, Target::Arch arch); + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/type.cc b/paddle/infrt/common/type.cc new file mode 100644 index 0000000000000..f262bd4697b36 --- /dev/null +++ b/paddle/infrt/common/type.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/type.h" + +#include + +namespace infrt { +namespace common { + +struct Type::Storage { + Storage() = default; + Storage(type_t t, int b, int w) : type_(t), bits_(b), lanes_(w) {} + + type_t type_{type_t::Unk}; + cpp_type_t cpp_type_{cpp_type_t::None}; + + //! How many bits per element. + int bits_{}; + + //! How many elements(if a vector type), for scalar types, it should be 1. + int lanes_{1}; + + //! Name of the customized type. 
+ std::string customized_type_; +}; + +Type::~Type() {} + +std::ostream &operator<<(std::ostream &os, const Type &t) { + if (t.is_cpp_const()) os << "const "; + switch (t.type()) { + case Type::type_t::Int: + if (t.bits() == 1) { + os << "bool"; + } else { + os << "int" << t.bits(); + } + + break; + case Type::type_t::UInt: + os << "uint" << t.bits(); + break; + + case Type::type_t::Float: + os << "float" << t.bits(); + break; + case Type::type_t::Void: + os << "void"; + break; + case Type::type_t::Customized: + os << t.customized_type(); + break; + case Type::type_t::String: + os << "string"; + break; + case Type::type_t::Unk: + os << "unk"; + break; + } + + if (t.lanes() > 1) os << "<" << t.lanes() << ">"; + if (t.is_cpp_handle()) os << "*"; + if (t.is_cpp_handle2()) os << "**"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, Type::type_t t) { + switch (t) { + case Type::type_t::String: + os << "String"; + break; + case Type::type_t::Void: + os << "Void"; + break; + case Type::type_t::UInt: + os << "UInt"; + break; + case Type::type_t::Int: + os << "Int"; + break; + case Type::type_t::Float: + os << "Float"; + break; + case Type::type_t::Unk: + os << "Unk"; + break; + case Type::type_t::Customized: + os << "Customized"; + } + return os; +} + +Type &Type::set_cpp_handle(bool x) { + // unset the other handle-related bits. + set_cpp_handle2(false); + + auto &v = (*reinterpret_cast(&GetStorage().cpp_type_)); + // unset the other handle-related bits. + v &= ~static_cast(cpp_type_t::Handle); + v &= ~static_cast(cpp_type_t::HandleHandle); + + if (x) + v |= static_cast(cpp_type_t::Handle); + else + v &= ~static_cast(cpp_type_t::Handle); + + return *this; +} + +Type &Type::set_cpp_handle2(bool x) { + auto &v = (*reinterpret_cast(&GetStorage().cpp_type_)); + + // unset the other handle-related bits. 
+ v &= ~static_cast(cpp_type_t::Handle); + v &= ~static_cast(cpp_type_t::HandleHandle); + + if (x) + v |= static_cast(cpp_type_t::HandleHandle); + else + v &= ~static_cast(cpp_type_t::HandleHandle); + + return *this; +} + +Type Type::VectorOf(int w) const { + CheckTypeValid(); + return Type(type(), w, bits()); +} + +Type::Type(const Type &other) { + if (other.storage_) storage_.reset(new Storage(*other.storage_)); +} + +Type Type::ElementOf() const { + CheckTypeValid(); + auto type = *this; + type.storage_->lanes_ = 1; + return type; +} + +void Type::CheckTypeValid() const { CHECK_NE(GetStorage().type_, type_t::Unk); } + +Type Type::PointerOf() const { + CheckTypeValid(); + auto x = *this; + CHECK(!x.is_cpp_handle2()) << "Not support three level of PointerOf"; + if (x.is_cpp_handle()) + x.set_cpp_handle2(); + else + x.set_cpp_handle(); + return x; +} + +Type Type::ConstOf() const { + CheckTypeValid(); + auto x = *this; + x.set_cpp_const(); + return x; +} + +Type Type::IgnoreConst() const { + CheckTypeValid(); + auto x = *this; + x.set_cpp_const(false); + return x; +} + +Type Type::with_bits(int x) const { + CHECK(is_primitive()); + Type type = *this; + type.GetStorage().bits_ = x; + return type; +} + +Type Type::with_type(Type::type_t x) const { + Type type = *this; + type.GetStorage().type_ = x; + return type; +} + +Type Type::with_lanes(int x) const { + CHECK(valid()); + Type type = *this; + type.GetStorage().lanes_ = x; + return type; +} + +Type Type::with_cpp_const(bool x) const { + Type type = *this; + type.set_cpp_const(x); + return type; +} + +Type &Type::set_cpp_const(bool is_const) { + uint8_t &data = *reinterpret_cast(&GetStorage().cpp_type_); + if (is_const) { + data |= static_cast(cpp_type_t::Const); + } else { + data &= ~(static_cast(cpp_type_t::Const)); + } + + return *this; +} +Type &Type::set_customized_type(const std::string &t) { + GetStorage().type_ = type_t::Customized; + GetStorage().customized_type_ = t; + + return *this; +} + +bool Type::valid() const { + if (is_unk()) return false; + if (is_customized()) { + return !GetStorage().customized_type_.empty(); + } + if (is_primitive()) { + return bits() != 0; + } + return true; +} + +Type::Type(Type::type_t t, int b, int w) : storage_(new Storage(t, b, w)) {} +bool Type::is_primitive() const { + return !is_unk() && type() != type_t::Customized; +} +bool Type::is_customized() const { + return !is_unk() && type() == type_t::Customized; +} +bool Type::is_unk() const { return type() == type_t::Unk; } +bool Type::is_bool() const { return type() == type_t::UInt && bits() == 1; } +bool Type::is_void() const { return type() == type_t::Void; } +bool Type::is_vector() const { return lanes() > 1; } +bool Type::is_scalar() const { return lanes() == 1; } +bool Type::is_float(int bits) const { + return type() == type_t::Float && (bits < 0 || bits == this->bits()); +} +bool Type::is_uint(int bits) const { + return type() == type_t::UInt && (bits < 0 || bits == this->bits()); +} +bool Type::is_int(int bits) const { + return type() == type_t::Int && (bits < 0 || bits == this->bits()); +} +bool Type::is_integer(int bits) const { + return (type() == type_t::Int || type() == type_t::UInt) && + (bits < 0 || bits == this->bits()); +} +bool Type::is_index_type() { + return is_int() && lanes() == 1 && (bits() == 32 || bits() == 64); +} +bool Type::is_cpp_handle() const { + return static_cast(GetStorage().cpp_type_) & + static_cast(cpp_type_t::Handle); +} +bool Type::is_cpp_handle2() const { + return static_cast(GetStorage().cpp_type_) & + 
static_cast(cpp_type_t::HandleHandle); +} +bool Type::is_cpp_const() const { + return static_cast(cpp_type_t::Const) & + static_cast(GetStorage().cpp_type_); +} +const std::string &Type::customized_type() const { + return GetStorage().customized_type_; +} +bool Type::is_customized_type() const { + return !GetStorage().customized_type_.empty(); +} +Type::type_t Type::type() const { return GetStorage().type_; } +int Type::bits() const { return GetStorage().bits_; } +int Type::lanes() const { return GetStorage().lanes_; } +Type::cpp_type_t Type::cpp_type() const { return GetStorage().cpp_type_; } +bool Type::operator==(const Type &other) const { + return type() == other.type() && bits() == other.bits() && + lanes() == other.lanes() && + GetStorage().cpp_type_ == other.GetStorage().cpp_type_ && + customized_type() == other.customized_type(); +} +bool Type::is_string() const { return type() == type_t::String; } + +Type &Type::operator=(const Type &other) { + if (other.storage_) storage_.reset(new Storage(*other.storage_)); + return *this; +} + +Type::Storage &Type::GetStorage() { return *storage_; } +const Type::Storage &Type::GetStorage() const { return *storage_; } + +Type::Type() : storage_(new Storage) {} +Type::Type(Type &&other) : storage_(std::move(other.storage_)) {} + +const Type &F16() { + static auto t = Float(16); + return t; +} +const Type &F32() { + static auto t = Float(32); + return t; +} +const Type &F64() { + static auto t = Float(64); + return t; +} +const Type &I8() { + static auto t = Int(8); + return t; +} +const Type &I16() { + static auto t = Int(16); + return t; +} +const Type &I32() { + static auto t = Int(32); + return t; +} +const Type &I64() { + static auto t = Int(64); + return t; +} +const Type &UI8() { + static auto t = UInt(8); + return t; +} +const Type &UI16() { + static auto t = UInt(16); + return t; +} +const Type &UI32() { + static auto t = UInt(32); + return t; +} +const Type &UI64() { + static auto t = UInt(64); + return t; +} +const Type &I1() { + static auto t = Int(1); + return t; +} +const Type &UI1() { + static auto t = UInt(1); + return t; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/type.h b/paddle/infrt/common/type.h new file mode 100644 index 0000000000000..b532fc154ff02 --- /dev/null +++ b/paddle/infrt/common/type.h @@ -0,0 +1,223 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/common/macros.h" + +//! Much of the concepts are borrowed from Halide project. + +namespace infrt { +namespace common { + +/** + * Types in the INFRT type system. They can be ints, unsigned ints, or floats of + * various bit-widths. + * They can also be vectors of the same (by setting the `lanes` field to + * something larger than one). + * NOTE: Front-end code other than vectorize shouldn't use vector types. 
+ */ +struct Type { + enum class type_t { + Unk = -1, + Int, + UInt, + Float, + String, + Void, + // stupid idea to mix the Customized with other primitive types, large + // refactor needs here. + Customized, // Customized type + }; + + //! type decorators in C++, the different code can used together. + enum class cpp_type_t : uint8_t { + None = 0, // None information. + Const = 1, // const. + Handle = 1 << 1, // pointer type, such as `infrt_buffer_t*`. + HandleHandle = 1 << 2, // pointer of pointer, such as `infrt_buffer_t**`. + }; + + Type(); + Type(type_t t, int b, int w); + Type(const Type& other); + explicit Type(Type&& other); + Type& operator=(const Type& other); + + INFRT_NODISCARD bool is_primitive() const; + INFRT_NODISCARD bool is_customized() const; + INFRT_NODISCARD bool valid() const; + + //! Some helper functions to check a type. + // @{ + INFRT_NODISCARD bool is_unk() const; + INFRT_NODISCARD bool is_void() const; + INFRT_NODISCARD bool is_bool() const; + INFRT_NODISCARD bool is_vector() const; + INFRT_NODISCARD bool is_scalar() const; + INFRT_NODISCARD bool is_float(int bits = -1) const; + INFRT_NODISCARD bool is_int(int bits = -1) const; + INFRT_NODISCARD bool is_integer(int bits = -1) const; + INFRT_NODISCARD bool is_uint(int bits = -1) const; + INFRT_NODISCARD bool is_string() const; + INFRT_NODISCARD bool is_index_type(); + // @} + + Type& set_cpp_handle(bool x = true); + INFRT_NODISCARD bool is_cpp_handle() const; + + Type& set_cpp_handle2(bool x = true); + INFRT_NODISCARD bool is_cpp_handle2() const; + + Type& set_cpp_const(bool is_const = true); + INFRT_NODISCARD bool is_cpp_const() const; + + Type& set_customized_type(const std::string& t); + const std::string& customized_type() const; + INFRT_NODISCARD bool is_customized_type() const; + + // Get a new type with bits set to \p x. + Type with_bits(int x) const; + // Get a new type with type set to \p x. + Type with_type(type_t x) const; + // Get a new type with lanes set to \p x. + Type with_lanes(int x) const; + // Get a new type with cpp_const set to \p x. + Type with_cpp_const(bool x = true) const; + + //! Getters + // @{ + type_t type() const; + int bits() const; + int lanes() const; + cpp_type_t cpp_type() const; + // @} + + //! Compare two types for equality. + bool operator==(const Type& other) const; + + //! Compare two types for inequality. + bool operator!=(const Type& other) const { return !(*this == other); } + + //! Generate a vector of this type, with `w` elements. + Type VectorOf(int w) const; + //! Generate a element type of this type. + Type ElementOf() const; + //! Generate the address type. + Type PointerOf() const; + //! Ignore const. + Type IgnoreConst() const; + //! Add const. + Type ConstOf() const; + + friend std::ostream& operator<<(std::ostream& os, const Type& t); + + ~Type(); + + private: + void CheckTypeValid() const; + + struct Storage; + Storage& GetStorage(); + const Storage& GetStorage() const; + + std::unique_ptr storage_; +}; // namespace common + +inline Type Void() { return Type(Type::type_t::Void, 1, 0); } +inline Type Int(int bits, int lanes = 1) { + return Type(Type::type_t::Int, bits, lanes); +} +inline Type UInt(int bits, int lanes = 1) { + return Type(Type::type_t::UInt, bits, lanes); +} +inline Type Float(int bits, int lanes = 1) { + return Type(Type::type_t::Float, bits, lanes); +} +inline Type Bool(int lanes = 1) { return Type(Type::type_t::UInt, 1, lanes); } +inline Type String() { return Type(Type::type_t::String, 1, 1); } + +//! 
Builtin native types as global singletons. +// @{ +const Type& F16(); +const Type& F32(); +const Type& F64(); +const Type& I8(); +const Type& I16(); +const Type& I32(); +const Type& I64(); +const Type& UI8(); +const Type& UI16(); +const Type& UI32(); +const Type& UI64(); +const Type& I1(); +const Type& UI1(); +// @} + +template +Type type_of(); + +// clang-format off +template <> inline Type type_of() { return F32(); } +template <> inline Type type_of() { return F64(); } +template <> inline Type type_of() { return UI8(); } +template <> inline Type type_of() { return UI16(); } +template <> inline Type type_of() { return I32(); } +template <> inline Type type_of() { return UI32(); } +template <> inline Type type_of() { return UI1(); } +template <> inline Type type_of() { return I8(); } +template <> inline Type type_of() { return I64(); } +template <> inline Type type_of() { return UI64(); } +template <> inline Type type_of() { return I8(); } +template <> inline Type type_of() { return Void(); } +// clang-format on +template <> +inline Type type_of() { + Type x = Int(8); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle2(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} + +std::ostream& operator<<(std::ostream& os, Type::type_t t); + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt new file mode 100644 index 0000000000000..c1517beab0662 --- /dev/null +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -0,0 +1,61 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + dialect.cc + types.cc + basic_kernels.cc + test_kernels.cc + infrt_base.cc + init_infrt_dialects.cc + tensor_shape.cc + dense_tensor.cc + mlir_loader.cc + diagnostic_utils.cc + pd_types.cc + pd_ops.cc + ) + +mlir_tablegen_on(ops) +mlir_tablegen_on(basic_kernels) +mlir_tablegen_on(test_kernels) +mlir_tablegen_on(infrt_base DIALECT infrt) +mlir_tablegen_on(tensor_shape DIALECT ts) +mlir_tablegen_on(dense_tensor DIALECT dt) +mlir_tablegen_on(pd_op_base DIALECT pd) +mlir_tablegen_on(pd_ops) +mlir_add_rewriter(rewrite) + +# TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code +add_executable(infrtopt opt.cc) +target_link_libraries(infrtopt infrt ${mlir_libs}) +add_dependencies(infrtopt infrt) + +add_executable(print-ir print_ir.cc) +target_link_libraries(print-ir infrt ${mlir_libs}) +add_dependencies(print-ir pd_ops_inc) + + +# MLIR opt tests +# %{ +set(infrt_opt_path ${CMAKE_BINARY_DIR}/infrt/dialect/infrtopt) + +add_test(test_infrt_mlir_opt_on_basic ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/basic.mlir) +add_test(test_infrt_mlir_opt_on_tensor_shape ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/tensor_shape.mlir) +add_test(test_infrt_mlir_opt_on_paddle_ops + ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/paddle_ops.mlir) +# %} + +cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) + +# execute mlir and run FileCheck +infrt_exec_check(run_and_check_tensor_type mlir_tests/tensor_type.mlir) +infrt_exec_check(run_and_check_basic mlir_tests/basic.mlir) +infrt_exec_check(run_and_check_benchmark 
mlir_tests/benchmark.mlir) +#infrt_exec_check(run_and_check_dense_tensor mlir_tests/dense_tensor.mlir) +add_test(test_infrt_mlir_dense_tensor + ${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec + -i + ${CMAKE_CURRENT_SOURCE_DIR}/mlir_tests/dense_tensor.mlir) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc new file mode 100644 index 0000000000000..b4d2b9182b0c5 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/basic_kernels.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" + +namespace infrt::dialect { +using namespace mlir; // NOLINT + +static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SymbolRefAttr callee_attr; + FunctionType callee_type; + SmallVector operands; + auto callee_loc = parser.getNameLoc(); + if (parser.parseAttribute(callee_attr, "callee", result.attributes) || + parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(callee_type) || + parser.addTypesToList(callee_type.getResults(), result.types) || + parser.resolveOperands( + operands, callee_type.getInputs(), callee_loc, result.operands)) + return failure(); + return success(); +} + +static ParseResult parseConstantOp(Type attrType, + OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + Attribute valueAttr; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(valueAttr, attrType, "value", result.attributes) || + parser.addTypeToList(attrType, result.types)) + return failure(); + return success(); +} + +static ParseResult parseConstantF32Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + FloatType::getF32(result.getContext()), parser, result); +} +static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + FloatType::getF64(result.getContext()), parser, result); +} +static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + IntegerType::get(32, result.getContext()), parser, result); +} +static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + IntegerType::get(64, result.getContext()), parser, result); +} + +static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SmallVector opInfo; + SmallVector types; + llvm::SMLoc loc = parser.getCurrentLocation(); + return failure(parser.parseOperandList(opInfo) || + (!opInfo.empty() && parser.parseColonTypeList(types)) || + parser.resolveOperands(opInfo, types, 
loc, result.operands)); +} + +static void print(OpAsmPrinter &p, CallOp op) { // NOLINT + p << "infrt.call " << op.getAttr("callee") << "("; + p.printOperands(op.getOperands()); + p << ")"; + p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p << " : "; +} + +static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT + p << op->getName() << " "; + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); + + if (op->getAttrs().size() > 1) p << ' '; + Attribute attr = op->getAttr("value"); + if (auto int_attr = attr.dyn_cast()) { + bool is_signed = int_attr.getType().isIndex() || + int_attr.getType().getIntOrFloatBitWidth() != 1; + int_attr.getValue().print(p.getStream(), is_signed); + } else if (auto float_attr = attr.dyn_cast()) { + p << float_attr.getValue().convertToFloat(); + } else { + op->emitOpError("unknown attribute type"); + } +} + +static void print(OpAsmPrinter &p, ConstantF32Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantF64Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantI32Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT + printConstant(p, op); +} + +static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT + p << "infrt.return"; + if (op.getNumOperands() > 0) { + p << ' '; + p.printOperands(op.getOperands()); + p << " : "; + llvm::interleaveComma(op.getOperands(), p); + } +} + +static LogicalResult verify(CallOp op) { return success(); } + +static LogicalResult verify(ConstantF32Op op) { return success(); } +static LogicalResult verify(ConstantI32Op op) { return success(); } +static LogicalResult verify(ConstantF64Op op) { return success(); } +static LogicalResult verify(ConstantI64Op op) { return success(); } + +static LogicalResult verify(ReturnOp op) { + auto function = dyn_cast(op.getParentOp()); + + if (!function) return success(); + + auto results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError("has ") + << op.getNumOperands() + << " operands, but enclosing function returns " << results.size(); + + return success(); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/basic_kernels.cpp.inc" + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h new file mode 100644 index 0000000000000..65316bc1437c0 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.h @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include + +using namespace mlir; // NOLINT + +namespace infrt::dialect { +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/basic_kernels.hpp.inc" +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td new file mode 100644 index 0000000000000..df5e4d8a2c6a1 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.td @@ -0,0 +1,139 @@ +// Operation definitions for basic kernels. + +#ifdef BASIC_OPS +#else +#define BASIC_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +class INFRT_Op traits = []> : Op { + + // Each registered op needs to provide all of a printer, parser and verifier. + let printer = [{ return infrt::dialect::print(p, *this); }]; + let verifier = [{ return infrt::dialect::verify(*this); }]; + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; +} + +def CallOp : INFRT_Op<"call"> { + let summary = "call a host operation"; + let description = [{ + The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. + + %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + let extraClassDeclaration = [{ + StringRef getCallee() { return callee(); } + mlir::FunctionType getCalleeType(); + }]; +} + +class ConstantOp + : INFRT_Op<"constant." # suffix, [NoSideEffect]> { + let summary = "constant value constructor in host"; + + let arguments = (ins attr:$value); + let results = (outs baseType); +} + +def ConstantI32Op : ConstantOp<"i32", I32, I32Attr>; +def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>; +def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>; +def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; + +def ReturnOp : INFRT_Op<"return", [Terminator]> { + let summary = "host executor return operation"; + let description = [{ + The "infrt.return" operation represents a return operation within a function. + + func @foo() : (i32, f8) { + infrt.return %0, %1 : i32, f8 + } + }]; + + let arguments = (ins Variadic:$operands); + + let builders = [OpBuilder< + "OpBuilder &b, OperationState &result", + [{ build(b, result, llvm::None); }]>]; +} + +class AddOp : INFRT_Op<"add." # suffix, [NoSideEffect]> { + let summary = "infrt.add operation"; + let description = [{ + An operation that takes two inputs and returns their sum as result. + }]; + + let arguments = (ins type, type); + let results = (outs type); + let assemblyFormat = "operands attr-dict"; + let verifier = ?; +} + +def AddI32Op : AddOp<"i32", I32>; +def AddI64Op : AddOp<"i64", I64>; +def AddF32Op : AddOp<"f32", F32>; +def AddF64Op : AddOp<"f64", F64>; + +class MulOp : INFRT_Op<"mul." # suffix, [NoSideEffect]> { + let summary = "infrt.mul operation"; + let description = [{ + An operation that takes two inputs and returns their mul as result. + }]; + + let arguments = (ins type, type); +let results = (outs type); +let assemblyFormat = "operands attr-dict"; +let verifier = ?; +} + +def MulI32Op : MulOp<"i32", I32>; +def MulI64Op : MulOp<"i64", I64>; +def MulF32Op : MulOp<"f32", F32>; +def MulF64Op : MulOp<"f64", F64>; + +class PrintOp : INFRT_Op<"print." # suffix> { + let summary = "infrt.print operation"; + let description = [{ + An operation takes a number as input and prints to stdout. 
+ }]; + + let arguments = (ins type); + let assemblyFormat = "operands attr-dict"; + let verifier = ?; +} + +//def PrintI32Op : PrintOp<"i32", I32>; +//def PrintI64Op : PrintOp<"i64", I64>; +def PrintF32Op : PrintOp<"f32", F32>; +//def PrintF64Op : PrintOp<"f64", F64>; + +def GetStringOp : INFRT_Op<"get_string"> { + let summary = "infrt.get_string"; + let description = [{ + Get a !infrt.string value from the given string attribute. + }]; + + let arguments = (ins StrAttr:$value); + let results = (outs StringType); + let assemblyFormat = "`(` $value `)` attr-dict"; + let verifier = ?; +} + +def PrintStringOp : INFRT_Op<"print_string"> { + let summary = "infrt.print_string"; + let description = [{ + An operation that prints a string. + }]; + + let arguments = (ins StringType:$input); + let results = (outs); + let assemblyFormat = "`(` $input `)` attr-dict"; + let verifier = ?; +} + +#endif // basic kernels diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc new file mode 100644 index 0000000000000..629a7b16523fc --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -0,0 +1,277 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/dense_tensor.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/tensor_shape.h" + +namespace infrt::dt { + +void DTDialect::initialize() { + allowUnknownTypes(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/dense_tensor.cpp.inc" + >(); +} + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail + +llvm::Optional GetTargetType(mlir::StringRef key) { + if (key.equals_lower("x86")) + return TargetType::X86; + else if (key.equals_lower("cuda")) + return TargetType::CUDA; + else + return llvm::None; +} + +llvm::Optional GetLayoutType(mlir::StringRef key) { + if (key.equals_lower("nchw")) + return LayoutType::NCHW; + else if (key.equals_lower("nhwc")) + return LayoutType::NHWC; + else + return llvm::None; +} + +llvm::Optional GetPrecisionType(mlir::StringRef key) { + if (key.equals_lower("i32")) + return PrecisionType::I32; + else if 
(key.equals_lower("f32")) + return PrecisionType::F32; + else + return llvm::None; +} + +TensorType TensorType::get(TargetType target, + LayoutType layout, + PrecisionType precision) { + return Base::get( + ::infrt::Global::getMLIRContext(), target, layout, precision); +} + +TargetType TensorType::target() { return getImpl()->target_; } + +LayoutType TensorType::layout() { return getImpl()->layout_; } + +PrecisionType TensorType::precision() { return getImpl()->precision_; } + +raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { + os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() + << ", " << tensorType.precision() << ">"; + return os; +} + +TensorMapType TensorMapType::get() { + return Base::get(::infrt::Global::getMLIRContext()); +} + +TensorMapType TensorMapType::get(mlir::MLIRContext *context) { + return Base::get(context); +} + +StringType StringType::get() { + return Base::get(::infrt::Global::getMLIRContext()); +} + +StringType StringType::get(mlir::MLIRContext *context) { + return Base::get(context); +} + +raw_ostream &operator<<(raw_ostream &os, TargetType type) { + switch (type) { + case (TargetType::X86): + os << "X86"; + break; + case (TargetType::CUDA): + os << "CUDA"; + break; + default: + os << "Unsupported"; + } + return os; +} + +raw_ostream &operator<<(raw_ostream &os, LayoutType type) { + switch (type) { + case (LayoutType::NCHW): + os << "NCHW"; + break; + case (LayoutType::NHWC): + os << "NHWC"; + break; + default: + os << "Unsupported"; + } + return os; +} + +raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { + switch (type) { + case (PrecisionType::I32): + os << "I32"; + break; + case (PrecisionType::F32): + os << "F32"; + break; + default: + os << "Unsupported"; + } + return os; +} + +static Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = Identifier::get("t", context); + return OpaqueType::get(t_dialect, "tensor", context); +} + +static ParseResult parseCreateUninitTensorOp( + OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + auto loc = parser.getCurrentLocation(); + ::mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + + mlir::ArrayAttr shapeAttr; + if (parser.parseAttribute(shapeAttr, + parser.getBuilder().getI64Type(), + "shape", + result.attributes)) + return failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + + if (parser.parseArrow()) return failure(); + if (parser.parseType(outputRawTypes[0])) return failure(); + if (!outputRawTypes[0].isa()) + return parser.emitError(loc, "invalid kind of type specified"); + result.addTypes(outputTypes); + return success(); +} + +template +static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT + CreateUninitTensorOp op) { + p << CreateUninitTensorOp::getOperationName(); + p << " "; + p.printAttributeWithoutType(op.shapeAttr()); + p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p << " -> "; + p << op.getOperation()->getResultTypes(); +} + +// TODO(shibo): can be removed? 
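+
+// A sketch of the textual form the CreateUninitTensorOp parser/printer above
+// round-trip (cf. mlir_tests/dense_tensor.mlir); the tensor type parameters
+// follow the target/layout/precision scheme parsed in infrt_base.cc:
+//   %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor<X86, NCHW, F32>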
+// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, +// OperationState& result) { +// auto loc = parser.getCurrentLocation(); +// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; +// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> +// inputOperands(inputRawOperands); +// ::mlir::Type inputRawTypes[1]; +// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); +// +// if (parser.parseOperand(inputRawOperands[0])) return failure(); +// +// if (parser.parseColon()) return failure(); +// if (parser.parseType(inputRawTypes[0])) return failure(); +// if (!inputRawTypes[0].isa()) +// return parser.emitError(loc, "invalid kind of type specified"); +// +// Attribute value_attr; +// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) +// return failure(); +// if (parser.parseAttribute(value_attr, "value", result.attributes)) return +// failure(); +// return success(); +//} + +// TODO(shibo): can be removed? +// template +// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { +// p << FillTensorOp::getOperationName(); +// p << " "; +// p.printOperand(op.getOperand()); +// p << " : "; +// p << op.getOperation()->getOperandTypes(); +// p << " "; +// p << op.getAttr("value"); +//} + +static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SmallVector operands; + if (parser.parseOperandList(operands, 1)) return failure(); + + auto tensor_type = getTensorType(result.getContext()); + + Attribute value_attr; + return failure( + parser.resolveOperand(operands[0], tensor_type, result.operands) || + parser.parseAttribute(value_attr, "values", result.attributes)); +} + +template +static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT + p << SetTensorOp::getOperationName() << " "; + p.printOperand(op.getOperand()); + p << " " << op.getAttr("values"); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT + +} // namespace infrt::dt diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h new file mode 100644 index 0000000000000..866c62213ab05 --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
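+//
+// This header declares the hand-written pieces of the dt (DenseTensor)
+// dialect: the TargetType/LayoutType/PrecisionType enums and the TensorType,
+// TensorMapType and StringType types. Their textual forms, as handled by
+// INFRTDialect::parseType/printType in infrt_base.cc, look like:
+//
+//   !infrt.tensor<X86, NCHW, F32>
+//   !infrt.tensor_map
+//   !infrt.string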
+ +#pragma once +#include +#include +#include + +#include + +using namespace mlir; // NOLINT +namespace infrt::dt { + +namespace detail { +struct TensorTypeStorage; +} // namespace detail + +enum class TargetType : uint8_t { X86, CUDA }; +enum class LayoutType : uint8_t { NCHW, NHWC }; +enum class PrecisionType : uint8_t { I32, F32 }; + +llvm::Optional GetTargetType(mlir::StringRef key); +llvm::Optional GetLayoutType(mlir::StringRef key); +llvm::Optional GetPrecisionType(mlir::StringRef key); + +raw_ostream &operator<<(raw_ostream &os, TargetType type); +raw_ostream &operator<<(raw_ostream &os, LayoutType type); +raw_ostream &operator<<(raw_ostream &os, PrecisionType type); + +class TensorType : public mlir::Type::TypeBase { + public: + using Base::Base; + static TensorType get(TargetType target, + LayoutType layout, + PrecisionType precision); + + TargetType target(); + LayoutType layout(); + PrecisionType precision(); +}; + +raw_ostream &operator<<(raw_ostream &os, TensorType tensorType); + +class TensorMapType : public mlir::Type::TypeBase { + public: + using Base::Base; + static TensorMapType get(); + static TensorMapType get(mlir::MLIRContext *context); +}; + +class StringType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static StringType get(); + static StringType get(mlir::MLIRContext *context); +}; + +#include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/dense_tensor.hpp.inc" + +} // namespace infrt::dt diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td new file mode 100644 index 0000000000000..07e70cb2ca1ee --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.td @@ -0,0 +1,150 @@ +#ifdef DT_OPS +#else +#define DT_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/tensor_shape_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def DT_Dialect : Dialect { + let name = "dt"; + + let description = [{ + The DenseTensor dialect. + }]; + + let cppNamespace = "::infrt::dt"; +} + +class DT_Op traits = []> : + Op; + +class CreateUninitTensorOp + : DT_Op<"create_uninit_tensor." # dtype, [NoSideEffect]> { + let summary = "dt.create_uninit_tensor operation"; + + let description = [{ + An operation that creates an uninitialized tensor. + }]; + + let arguments = (ins I64ArrayAttr:$shape); + let results = (outs TensorType:$output); + + let parser = [{ return infrt::dt::parseCreateUninitTensorOp(parser, result); }]; + let printer = [{ return infrt::dt::printCreateUninitTensorOp(p, *this); }]; +} + + +def ShallowCopyTensorOp + : DT_Op<"shallow_copy_tensor", [NoSideEffect]> { + let summary = "dt.shallow_copy_tensor operation"; + + let description = [{ + An operation that copy a tensor shallowly. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs TensorType:$output); + + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; +} + + +class FillTensorWithConstantOp : + DT_Op<"fill_tensor_with_constant." # dtype> { + let summary = "dt.fill_tensor_with_constant operation"; + + let description = [{ + An operation that fills an input tensor with a value. + }]; + + let arguments = (ins + TensorType:$input, + AnyAttr:$value + ); + let results = (outs); + + // TODO: can be removed? 
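+  // With the declarative assemblyFormat below, the op is written, e.g., as
+  // (see mlir_tests/tensor_type.mlir):
+  //   dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor<X86, NCHW, F32>) {value=1.0:f32}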
+ //let parser = [{ return infrt::dt::parseFillTensorWithConstantOp(parser, result); }]; + //let printer = [{ return infrt::dt::printFillTensorWithConstantOp(p, *this); }]; + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +def PrintTensorOp : DT_Op<"print_tensor"> { + let summary = "dt.print_tensor operation"; + + let description = [{ + An operation that prints a tensor. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class SetTensorOp : + DT_Op<"set_tensor_with_constant_values." # dtype> { + let summary = "dt.set_tensor_with_constant_values operation"; + + let description = [{ + An operation that sets an input tensor with given values. + }]; + + let arguments = (ins TensorType); + let results = (outs); + + let parser = [{ return infrt::dt::parseSetTensorOp(parser, result); }]; + let printer = [{ return infrt::dt::printSetTensorOp(p, *this); }]; +} + +def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> { + let summary = "dt.load_params operation"; + + let description = [{ + An operation that can load tensors to TensorMap. + }]; + + // input path of model params. + let arguments = (ins StringType:$path); + let results = (outs TensorMapType); + + let assemblyFormat = "`(` operands `)` attr-dict"; + let verifier = ?; +} + +def GetParamOp : DT_Op<"get_param", [NoSideEffect]> { + let summary = "dt.get_param operation"; + + let description = [{ + An operation that can get a tensor from TensorMap. + }]; + + // input path of model params. + let arguments = (ins + TensorMapType:$map, + StrAttr:$name + ); + let results = (outs TensorType:$output); + let assemblyFormat = "`(` $map `,` $name `)` attr-dict `->` type($output)"; + let verifier = ?; +} + +def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { + let summary = "dt.get_tensor_shape operation"; + + let description = [{ + An operation that returns the shape of the input tensor. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs TS_Shape:$output); + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; +} + +foreach dtype = ["ui8", "ui16", "ui32", "ui64", "i32", "f32", "f64", "i64"] in { + def DT_CreateUninitTensorOp_#dtype : CreateUninitTensorOp; + def DT_FillTensorOp_#dtype : FillTensorWithConstantOp; + def DT_SetTensorOp_#dtype : SetTensorOp; +} + +#endif // DT_OPS diff --git a/paddle/infrt/dialect/diagnostic_utils.cc b/paddle/infrt/dialect/diagnostic_utils.cc new file mode 100644 index 0000000000000..a28176e38fdc7 --- /dev/null +++ b/paddle/infrt/dialect/diagnostic_utils.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/diagnostic_utils.h" + +#include + +namespace infrt::dialect { + +struct MyScopedDiagnosicHandler::Impl { + Impl() : diag_stream_(diag_str_) {} + + // String stream to assemble the final error message. 
+ std::string diag_str_; + llvm::raw_string_ostream diag_stream_; + + // A SourceMgr to use for the base handler class. + llvm::SourceMgr source_mgr_; + + // Log detail information. + bool log_info_{}; +}; + +MyScopedDiagnosicHandler::MyScopedDiagnosicHandler(mlir::MLIRContext *ctx, + bool propagate) + : mlir::SourceMgrDiagnosticHandler( + impl_->source_mgr_, ctx, impl_->diag_stream_), + impl_(new Impl) { + setHandler([this](mlir::Diagnostic &diag) { return this->handler(&diag); }); +} + +mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { + if (diag->getSeverity() != mlir::DiagnosticSeverity::Error && + !impl_->log_info_) + return mlir::success(); + emitDiagnostic(*diag); + impl_->diag_stream_.flush(); + return mlir::failure(true); +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h new file mode 100644 index 0000000000000..3a8098cf75181 --- /dev/null +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -0,0 +1,39 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include + +namespace infrt::dialect { + +/** + * A scoped diagnostic handler to help debug MLIR process. + */ +class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { + public: + MyScopedDiagnosicHandler(mlir::MLIRContext* ctx, bool propagate); + + mlir::LogicalResult handler(mlir::Diagnostic* diag); + + ~MyScopedDiagnosicHandler(); + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc new file mode 100644 index 0000000000000..cbcd5d0f0fa78 --- /dev/null +++ b/paddle/infrt/dialect/dialect.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infrt::hlir::dialect { + +class CinnDialect : public ::mlir::Dialect { + public: + explicit CinnDialect(::mlir::MLIRContext* ctx); + + //! 
We should register this function in dialect + static llvm::StringRef getDialectNamespace() { + return "infrt::hlir::dialect"; + } +}; + +} // namespace infrt::hlir::dialect diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc new file mode 100644 index 0000000000000..b28ad5ad4b5a5 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt_base.h" + +#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/test_kernels.h" + +namespace infrt::dialect { + +// ----INFRTDialect definition begin---- +void INFRTDialect::initialize() { + allowUnknownTypes(); + allowUnknownOperations(); + + addTypes(); + addTypes(); + addTypes(); + + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/basic_kernels.cpp.inc" + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/test_kernels.cpp.inc" + >(); +} + +mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + // parse TensorType, for example: !infrt.tensor + if (keyword == "tensor") { + llvm::StringRef target; + llvm::StringRef layout; + llvm::StringRef precision; + + // parse "<" + if (parser.parseLess()) return mlir::Type(); + // parse target + if (parser.parseKeyword(&target)) return mlir::Type(); + auto targetType = infrt::dt::GetTargetType(target); + if (!targetType) { + parser.emitError(parser.getCurrentLocation(), "unknown target type: ") + << target; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse layout + if (parser.parseKeyword(&layout)) return mlir::Type(); + auto layoutType = infrt::dt::GetLayoutType(layout); + if (!layoutType) { + parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") + << layout; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse precision + if (parser.parseKeyword(&precision)) return mlir::Type(); + auto precisionType = infrt::dt::GetPrecisionType(precision); + if (!precisionType) { + parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") + << precision; + return mlir::Type(); + } + + // parse ">" + if (parser.parseGreater()) return mlir::Type(); + + return infrt::dt::TensorType::get(*targetType, *layoutType, *precisionType); + } + // parse TensorMapType, for example: !infrt.tensor_map + if (keyword == "tensor_map") { + return infrt::dt::TensorMapType::get(); + } + // parse StringType, for example: !infrt.string + if (keyword == "string") { + return infrt::dt::StringType::get(); + } + + parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ") + << keyword; + return mlir::Type(); +} + +void INFRTDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter 
&printer) const { + // print TensorType, for example: !infrt.tensor + if (type.isa()) { + auto tensorType = type.cast(); + printer << "tensor<" << tensorType.target() << ", " << tensorType.layout() + << ", " << tensorType.precision() << ">"; + return; + } + // print TensorMapType, for example: !infrt.tensor_map + if (type.isa()) { + printer << "tensor_map"; + return; + } + // print StringType, for example: !infrt.string + if (type.isa()) { + printer << "string"; + return; + } + llvm_unreachable("unknown infrt type."); +} + +// ----INFRTDialect definition end---- + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h new file mode 100644 index 0000000000000..1398378957069 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt_base.hpp.inc" + +namespace infrt::dialect { + +class INFRTDialect : public ::mlir::Dialect { + explicit INFRTDialect(::mlir::MLIRContext *context) + : ::mlir::Dialect(getDialectNamespace(), + context, + ::mlir::TypeID::get()) { + initialize(); + } + + // parse types registered to the dialect. + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + // print types registered to the dialect. + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const override; + + void initialize(); + friend class ::mlir::MLIRContext; + + public: + static ::llvm::StringRef getDialectNamespace() { return "infrt"; } +}; + +} // namespace infrt::dialect + +namespace mlir { + +template +static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getIntegerAttr(b.getI32Type(), constant); +} + +static mlir::ValueRange cvtValueToValueRange(const mlir::Value &operand) { + return mlir::ValueRange(operand); +} + +static mlir::ValueRange concatTwoValueRange(mlir::ValueRange operand_0, + mlir::ValueRange operand_1) { + mlir::SmallVector<::mlir::Value, 4> operands; + operands.append(operand_0.begin(), operand_0.end()); + operands.append(operand_1.begin(), operand_1.end()); + return operands; +} + +} // namespace mlir diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td new file mode 100644 index 0000000000000..61dcfe5bfb1c3 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.td @@ -0,0 +1,42 @@ +#ifndef INFRT_BASE +#define INFRT_BASE + +include "mlir/IR/OpBase.td" + +def INFRT_Dialect : Dialect { + let name = "infrt"; + + let description = [{ + The INFRT host dialect. 
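+
+    Besides the infrt.* basic kernels, it provides the types parsed and
+    printed in infrt_base.cc:
+
+      !infrt.tensor<X86, NCHW, F32>
+      !infrt.tensor_map
+      !infrt.string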
+ }]; + + let cppNamespace = "::infrt::dialect"; +} + +// Type definitions +def StringType : + Type()">, "!infrt.string type">, + BuildableType<"$_builder.getType<::infrt::dt::StringType>()">; + +def TensorType : + Type()">, "!infrt.tensor type">; + +def TensorMapType : + Type()">, "!infrt.tensor_map type">, + BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">; + +def BufferType : OpaqueType<"b", "buffer", "buffer">; + +class INFRT_createI32Attr : NativeCodeCall< + "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + +def INFRT_cvtValueToValueRange : NativeCodeCall< + "mlir::cvtValueToValueRange($0)">; + +def INFRT_concatTwoValueRange : NativeCodeCall< + "mlir::concatTwoValueRange($0, $1)">; + +class IsBoolAttrEq : Constraint< + CPred<"($0.getValue() ==" # value # ")">, + "Bool attrbute value constraint">; +#endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc new file mode 100644 index 0000000000000..4bc2bf70942d2 --- /dev/null +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +#include + +#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/tensor_shape.h" + +namespace infrt { + +void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h new file mode 100644 index 0000000000000..50caca018980d --- /dev/null +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/IR/Dialect.h" + +namespace infrt { + +void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT + +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc new file mode 100644 index 0000000000000..8df8727dbe2b0 --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/mlir_loader.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "paddle/infrt/dialect/diagnostic_utils.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace infrt::dialect { + +mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, + const std::string& mlir_source) { + context->allowUnregisteredDialects(); + RegisterCinnDialects(context->getDialectRegistry()); + context->getDialectRegistry().insert(); + + mlir::ScopedDiagnosticHandler scope_handler( + context, [](mlir::Diagnostic& diag) { + if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) + return mlir::success(); + LOG(INFO) << "diag: " << diag.str(); + return mlir::failure(true); + }); + + auto res = mlir::parseSourceString( + llvm::StringRef(mlir_source.data(), mlir_source.length()), context); + CHECK(*res) << "failed to parse MLIR string"; + return res; +} + +mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, + mlir::MLIRContext* context) { + context->allowUnregisteredDialects(); + RegisterCinnDialects(context->getDialectRegistry()); + context->getDialectRegistry().insert(); + + mlir::ScopedDiagnosticHandler scope_handler( + context, [](mlir::Diagnostic& diag) { + if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) + return mlir::success(); + LOG(INFO) << "diag: " << diag.str(); + return mlir::failure(true); + }); + + return mlir::parseSourceFile(std::string(file_name), context); +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h new file mode 100644 index 0000000000000..092da7d9ce03f --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
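+//
+// A minimal usage sketch, mirroring mlir_loader_test.cc:
+//
+//   mlir::MLIRContext context;
+//   auto module = infrt::dialect::LoadMlirSource(&context, mlir_source);
+//   module->verify();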
+ +#pragma once + +#include +#include +#include + +#include + +namespace infrt::dialect { + +mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, + const std::string& mlir_source); +mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, + mlir::MLIRContext* context); + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc new file mode 100644 index 0000000000000..1b622d585ad8e --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/mlir_loader.h" + +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace infrt::dialect { + +TEST(MlirLoader, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v0) : (f32) -> () + + infrt.return %value : f32 +} +)ROC"; + + auto module = LoadMlirSource(&context, source); + module->verify(); + + LOG(INFO) << "module name: " << module->getOperationName().data(); + for (auto func : module->getOps()) { + LOG(INFO) << "get func " << func.getName().str(); + int num_args = func.getNumArguments(); + for (int i = 0; i < num_args; i++) { + LOG(INFO) << "arg: " << func.getArgument(i).getArgNumber(); + } + } +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_tests/basic.mlir b/paddle/infrt/dialect/mlir_tests/basic.mlir new file mode 100644 index 0000000000000..84b9b0fbd71cb --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/basic.mlir @@ -0,0 +1,40 @@ +// CHECK-LABEL: @basic_f32 +func @basic_f32() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK-NEXT: 3 + "infrt.print.f32"(%value) : (f32) -> () + + infrt.return %value : f32 +} + +/// ================================================================ +/// @caller call the other function @callee +func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { + %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + infrt.return %z1 : f32 +} + +// CHECK-LABEL: @caller.add.f32 +func @caller.add.f32() -> f32 { + %x = infrt.constant.f32 1.0 + %y = infrt.constant.f32 2.0 + %y1 = infrt.constant.f32 3.0 + %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + + // CHECK-NEXT: 6 + "infrt.print.f32"(%z) : (f32) -> () + infrt.return %z : f32 +} +/// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +// CHECK-LABEL: @string_test +func @string_test() { + %path = infrt.get_string("this is get_string op.") + // CHECK-LABEL: string = this is get_string op. 
+ infrt.print_string(%path) + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/benchmark.mlir b/paddle/infrt/dialect/mlir_tests/benchmark.mlir new file mode 100644 index 0000000000000..8b4530689df7e --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/benchmark.mlir @@ -0,0 +1,23 @@ +// CHECK-LABEL: @benchmark +func @benchmark() { + // CHECK-LABEL: BM:add.f32:Count: 3 + // CHECK-LABEL: BM:add.f32:Duration(ns) + // CHECK-LABEL: BM:add.f32:Time Min(ns) + // CHECK-LABEL: BM:add.f32:Time 50%(ns) + // CHECK-LABEL: BM:add.f32:Time 95%(ns) + // CHECK-LABEL: BM:add.f32:Time 99%(ns) + // CHECK-LABEL: BM:add.f32:CPU Min(ns) + // CHECK-LABEL: BM:add.f32:CPU 50%(ns) + // CHECK-LABEL: BM:add.f32:CPU 95%(ns) + // CHECK-LABEL: BM:add.f32:CPU 99%(ns) + // CHECK-LABEL: BM:add.f32:CPU utilization(percent) + infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + { + %0 = infrt.constant.f32 1.0 + %1 = infrt.constant.f32 2.0 + %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "infrt.print.f32"(%res) : (f32) -> () + infrt.return %res : f32 + } + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir b/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir new file mode 100644 index 0000000000000..cca7445cd58d8 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir @@ -0,0 +1,22 @@ +func @dense_shape0() { + %shape = ts.build_shape [1:i64, 57:i64] + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + + infrt.return +} + +func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor + + infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +} + + +func @main() { + %shape = ts.build_shape [1:i64, 57:i64] + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + + %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir new file mode 100644 index 0000000000000..1855a68dd91c3 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir @@ -0,0 +1,8 @@ +func @ops() { + %a = pd.Feed() : tensor + %b = pd.Feed() : tensor + + %c = "pd.Matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor + + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir new file mode 100644 index 0000000000000..c984fda3e6211 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -0,0 +1,24 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %a = "pd.Feed"() : () -> tensor + %b = "pd.Feed"() : () -> tensor + %bias = "pd.Feed"() : () -> tensor + + %b1 = "pd.Feed"() : () -> tensor + %b2 = "pd.Feed"() : () -> tensor + %bias1 = "pd.Feed"() : () -> tensor + %bias2 = "pd.Feed"() : () -> tensor + + %c = "pd.Matmul"(%a, %b) {transpose_y=false} : (tensor, tensor) -> tensor + %d = "pd.ElementwiseAdd"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %e = "pd.Relu6"(%d) {} : (tensor) -> tensor + + %c1 = "pd.Matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor + %d1 = "pd.ElementwiseAdd"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %e1 = "pd.Relu"(%d1) {} : (tensor) -> tensor + + %c2 = "pd.Matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) 
-> tensor + %d2 = "pd.ElementwiseAdd"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %e2 = "pd.Relu"(%d2) {} : (tensor) -> tensor + infrt.return %e2 : tensor +} \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir new file mode 100644 index 0000000000000..d41d4b2f9f6bc --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -0,0 +1,15 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %a = "pd.Feed"() : () -> tensor + %filter = "pd.Constant"(){value = dense<1.000000e+00> : tensor<3x64x3x3xf32>} : () -> tensor<3x64x3x3xf32> + %bias = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + + %scale = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %bias2 = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %mean = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %var = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + + %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor + %d = "pd.batch_norm"(%c, %scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor + infrt.return %d : tensor +} \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/tensor_map.mlir b/paddle/infrt/dialect/mlir_tests/tensor_map.mlir new file mode 100644 index 0000000000000..111c01c9a108b --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_map.mlir @@ -0,0 +1,31 @@ +// CHECK-LABEL: @predict +func @predict(%input:!infrt.tensor, %map: !infrt.tensor_map) -> (!infrt.tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.tensor + + %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + + // fc + "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + //dt.print_tensor (%out : !infrt.tensor) + + infrt.return %out : !infrt.tensor +} + +// CHECK-LABEL: @main +func @main() { + %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %path = infrt.get_string("/infrt/build/paddle/paddle_1.8_fc_model") + // CHECK-LABEL: loading params + %map = dt.load_params(%path) + + %out = infrt.call @predict(%input, %map): (!infrt.tensor, !infrt.tensor_map) -> (!infrt.tensor) + dt.print_tensor (%out : !infrt.tensor) + + infrt.return +} + diff --git a/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir b/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir new file mode 100644 index 0000000000000..504b5b36be038 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir @@ -0,0 +1,5 @@ +func @build_tensor1() { + %a = ts.build_shape [1:i64, 57:i64, 92:i64] + ts.print_shape %a + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/tensor_type.mlir b/paddle/infrt/dialect/mlir_tests/tensor_type.mlir new file mode 100644 index 0000000000000..c331097ab1072 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_type.mlir @@ -0,0 +1,9 @@ +// CHECK-LABEL: test_tensor_type +func @test_tensor_type() { + %a = 
dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%a : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td new file mode 100644 index 0000000000000..264134a447c63 --- /dev/null +++ b/paddle/infrt/dialect/ops.td @@ -0,0 +1,6 @@ +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" + + +class INFRT_Op traits = []> : + Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc new file mode 100644 index 0000000000000..d90d25230d0c2 --- /dev/null +++ b/paddle/infrt/dialect/opt.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/mlir_loader.h" + +int main(int argc, char **argv) { + mlir::MLIRContext *context = infrt::Global::getMLIRContext(); + + auto ®istry = context->getDialectRegistry(); + infrt::RegisterCinnDialects(registry); + + mlir::registerCanonicalizerPass(); + + return mlir::failed( + mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); +} diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td new file mode 100644 index 0000000000000..af53df113dfb3 --- /dev/null +++ b/paddle/infrt/dialect/pd_op_base.td @@ -0,0 +1,77 @@ +// This file defines some basic elements of Paddle(alias pd) dialect. +// We learned much from TensorFlow mlir dialect https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td + +#ifndef PD_OP_BASE +#define PD_OP_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def PD_Dialect : Dialect { + let name = "pd"; + + let description = [{ + The PaddlePaddle dialect. + + This dialect contains the PaddlePaddle operators. 
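+
+    For example (see mlir_tests/paddle_ops.mlir; tensor element types elided):
+
+      %c = "pd.Matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor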
+ }]; + + let cppNamespace = "::mlir::pd"; +} + +class PD_Op traits = []> : + Op; + + +class PD_PaddleAttr : + Attr()">, + "PaddlePaddle " # description # " attribute">; + + +//===----------------------------------------------------------------------===// +// PaddlePaddle type definitions +//===----------------------------------------------------------------------===// + +def PD_PDDialectType : Type()">, "PaddlePaddle type">; + +class PD_PaddleType : + Type()">, + "Paddle " # description # " type">, + BuildableType<"getType()">; + +//===----------------------------------------------------------------------===// +// Integer types +def PD_Bool : AnyTypeOf<[I<1>], "bool">; +def PD_Int8 : AnyTypeOf<[I8], "8-bit integer">; +def PD_Int16 : AnyTypeOf<[I16], "16-bit integer">; +def PD_Int32 : AnyTypeOf<[I32], "32-bit integer">; +def PD_Int64 : AnyTypeOf<[I64], "64-bit integer">; + +def PD_UInt8 : AnyTypeOf<[UI<8>], "8-bit unsigned integer">; +def PD_UInt16 : AnyTypeOf<[UI<16>], "16-bit unsigned integer">; +def PD_UInt32 : AnyTypeOf<[UI<32>], "32-bit unsigned integer">; +def PD_UInt64 : AnyTypeOf<[UI<64>], "64-bit unsigned integer">; + +def PD_SInt : AnyTypeOf<[PD_Int8, PD_Int16, PD_Int32, PD_Int64], "signed integer">; +def PD_UInt : AnyTypeOf<[PD_UInt8, PD_UInt16, PD_UInt32, PD_UInt64], "unsigned integer">; +def PD_Int : AnyTypeOf<[PD_SInt, PD_UInt], "integer">; + +// Float types +def PD_Float16 : AnyTypeOf<[F16], "16-bit float">; +def PD_Float32 : AnyTypeOf<[F32], "32-bit float">; +def PD_Float64 : AnyTypeOf<[F64], "64-bit float">; + +def PD_Float : AnyTypeOf<[PD_Float16, PD_Float32, PD_Float64], "floating-point">; + + +// Tensor types + +def PD_ElementType : Type, + "pd.dtype">; + +def PD_Tensor : TensorOf<[PD_ElementType]>; + + +#endif // PD_OP_BASE diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc new file mode 100644 index 0000000000000..7ca07dd5fcbba --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
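+//
+// Hand-written parts of the pd dialect: dialect registration, constant
+// materialization, InferTypeOpInterface hooks, canonicalization pattern
+// registration and constant folders. For example, ElementwiseAdd::fold below
+// adds two dense float constants element-wise, so a pd.ElementwiseAdd whose
+// operands are pd.Constant ops can be replaced by a single constant at
+// compile time.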
+ +#include "paddle/infrt/dialect/pd_ops.h" + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "paddle/infrt/dialect/infrt_base.h" + +namespace mlir { +namespace pd { + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" +#undef GET_OP_CLASSES + +PaddleDialect::PaddleDialect(MLIRContext *context) + : Dialect("pd", context, TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + >(); +#undef GET_OP_LIST +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT +#undef GET_OP_CLASSES + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + +void ConstantOp::build(OpBuilder &builder, + OperationState &state, + Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +LogicalResult ConstantOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return success(); +} +::mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef<::mlir::Attribute> operands) { + return value(); +} + +LogicalResult ElementwiseAdd::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} +void ElementwiseAdd::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +::mlir::OpFoldResult ElementwiseAdd::fold( + llvm::ArrayRef operands) { + if (getElementTypeOrSelf(getType()).isa()) { + if (!operands[0] || !operands[1]) return {}; + DenseElementsAttr lhs = operands[0].dyn_cast(); + DenseElementsAttr rhs = operands[1].dyn_cast(); + if (!lhs || !rhs) return {}; + ShapedType type = getType().template cast(); + if (!type.hasStaticShape()) return {}; + Type etype = type.getElementType(); + if (!etype.isa()) return {}; + SmallVector values; + values.reserve(lhs.getNumElements()); + for (const auto zip : + llvm::zip(lhs.getValues(), rhs.getValues())) { + values.push_back( + std::plus()(std::get<0>(zip), std::get<1>(zip))); + } + return DenseElementsAttr::get(type, values); + } + return {}; +} + +LogicalResult ElementwiseDiv::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult ElementwiseMul::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult 
ElementwiseSub::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult MulOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +void ReluOp::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +void FusedRepeatedFCRelu::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +void BatchNormOp::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h new file mode 100644 index 0000000000000..d09b6032257a2 --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/Dialect/Traits.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +namespace mlir { +namespace pd { + +class PaddleDialect : public Dialect { + public: + explicit PaddleDialect(MLIRContext* context); + + static StringRef getDialectNamespace() { return "pd"; } + + /// A hook used to materialize constant values with the given type. 
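+  /// The folding framework calls this when an operation (e.g. a
+  /// pd.ElementwiseAdd with constant operands) folds to an Attribute, so the
+  /// Attribute can be turned back into a pd.Constant operation.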
+ Operation* materializeConstant(OpBuilder& builder, + Attribute value, + Type type, + Location loc) override; + + Type parseType(DialectAsmParser& parser) const override { + return Dialect::parseType(parser); + } + void printType(Type type, DialectAsmPrinter& printer) const override { + Dialect::printType(type, printer); + } +}; + +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td new file mode 100644 index 0000000000000..9e906ad0c02cc --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.td @@ -0,0 +1,182 @@ +#ifndef PD_OPS +#define PD_OPS + +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/LoopLikeInterface.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/pd_op_base.td" + +def PD_FeedOp : PD_Op<"Feed", [NoSideEffect]> { + let summary = "Feed Op"; + + let description = [{ + Feed a tensor into the model. + }]; + + let arguments = (ins); + let results = (outs PD_Tensor:$out); + + let assemblyFormat = [{ + `(` `)` attr-dict `:` type($out) + }]; +} + +def PD_ConstantOp : PD_Op<"Constant", [NoSideEffect, ConstantLike, DeclareOpInterfaceMethods, AllTypesMatch<["value", "output"]>]> { + let summary = "constant Op"; + let description = [{}]; + + let arguments = (ins ElementsAttr:$value); + let results = (outs PD_Tensor:$output); + let hasFolder = 1; + + let builders = [ + OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + ]; +} + +def PD_AbsOp : PD_Op<"Abs", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the absolute value of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_SqrtOp : PD_Op<"sqrt", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the sqrt value of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_ReluOp : PD_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Relu of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); + let hasCanonicalizer = 1; +} + +def PD_Relu6Op : PD_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Relu6 of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_ElementwiseAdd : PD_Op<"ElementwiseAdd", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseAdd Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def PD_ElementwiseSub : PD_Op<"ElementwiseSub", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseSub Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_ElementwiseMul : PD_Op<"ElementwiseMul", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseMul Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_ElementwiseDiv : PD_Op<"ElementwiseDiv", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseDiv Op"; + let description = [{ + }]; + 
+ let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis);
+ let results = (outs PD_Tensor:$out);
+}
+
+def PD_MatmulOp : PD_Op<"Matmul", [NoSideEffect]> {
+ let summary = "Computes the matrix multiplication result of two tensors";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$x, PD_Tensor:$y,
+ DefaultValuedAttr:$transpose_x,
+ DefaultValuedAttr:$transpose_y,
+ DefaultValuedAttr:$alpha);
+ let results = (outs PD_Tensor:$out);
+
+ //let hasCanonicalizer = 1;
+}
+
+def PD_MulOp : PD_Op<"mul", [NoSideEffect, DeclareOpInterfaceMethods]> {
+ let summary = "paddle mul op";
+ let description = [{}];
+
+ let arguments = (ins PD_Tensor:$x, PD_Tensor:$y);
+ let results = (outs PD_Tensor:$out);
+
+ //let hasCanonicalizer = 1;
+}
+
+def PD_Conv2dOp : PD_Op<"conv2d", [NoSideEffect]> {
+ let summary = "paddle conv2d operation";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$Input, PD_Tensor:$Filter, PD_Tensor:$Bias);
+ let results = (outs PD_Tensor:$Output);
+
+ //let hasCanonicalizer = 1;
+}
+
+def PD_BatchNormOp : PD_Op<"batch_norm", [NoSideEffect]> {
+ let summary = "paddle batch_norm operation";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$X, PD_Tensor:$Scale, PD_Tensor:$Bias,
+ PD_Tensor:$Mean, PD_Tensor:$Variance,
+ DefaultValuedAttr:$epsilon);
+ let results = (outs PD_Tensor:$Y);
+
+ let hasCanonicalizer = 1;
+}
+
+def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> {
+ let summary = "Computes the Fully Connected result of two tensors";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$input, PD_Tensor:$w, PD_Tensor:$bias, DefaultValuedAttr:$in_num_col_dims);
+ let results = (outs PD_Tensor:$out);
+}
+
+def PD_FusedRepeatedFCRelu : PD_Op<"RepeatedFCRelu", [SameVariadicOperandSize, NoSideEffect]> {
+ let summary = "";
+ let description = [{ }];
+
+ let arguments = (ins PD_Tensor:$input, Variadic:$w, Variadic:$bias);
+ let results = (outs PD_Tensor:$out);
+ let hasCanonicalizer = 1;
+}
+
+#endif // PD_OPS
diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/infrt/dialect/pd_types.cc
new file mode 100644
index 0000000000000..94856e362d301
--- /dev/null
+++ b/paddle/infrt/dialect/pd_types.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/pd_types.h"
diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h
new file mode 100644
index 0000000000000..6f9fe56338a9f
--- /dev/null
+++ b/paddle/infrt/dialect/pd_types.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file defines the types used in PaddlePaddle MLIR dialect. +// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in +// tensorflow). + +#pragma once + +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" + +namespace mlir { +namespace PD { + +class PaddleType : public Type { + public: + using Type::Type; + + static bool classof(Type type); +}; + +namespace detail { + +template +class PaddleTypeImpl : public Type::TypeBase { + public: + using Base = typename Type::TypeBase; + using PDBase = PaddleTypeImpl; + using Base::Base; +}; + +} // namespace detail + +#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ + class pdtype##Type : public detail::PaddleTypeImpl { \ + public: \ + using PDBase::PDBase; \ + }; + +} // namespace PD +} // namespace mlir diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc new file mode 100644 index 0000000000000..3c5a2b6a7bf90 --- /dev/null +++ b/paddle/infrt/dialect/print_ir.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
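As an aside on pd_types.h above: each concrete Paddle type is meant to derive from the CRTP helper detail::PaddleTypeImpl and simply re-export the inherited constructors with `using Base::Base;`, which is also what the HANDLE_PD_TYPE macro spells out. The standalone sketch below, with made-up names and no MLIR dependency, only illustrates that constructor-forwarding idiom; it is not code from this patch.

#include <iostream>
#include <string>

// Plays the role of mlir::Type::TypeBase in this sketch.
struct TypeBase {
  explicit TypeBase(std::string name) : name_(std::move(name)) {}
  const std::string &name() const { return name_; }

 private:
  std::string name_;
};

// Plays the role of detail::PaddleTypeImpl: a CRTP middle layer that reuses
// the base constructors and adds shared behaviour.
template <typename Derived>
struct TypeImpl : TypeBase {
  using Base = TypeBase;
  using Base::Base;  // same constructor re-export as `using PDBase::PDBase;`
  void Describe() const { std::cout << "type: " << name() << "\n"; }
};

// Roughly what one HANDLE_PD_TYPE(...) instantiation boils down to.
struct DenseTensorType : TypeImpl<DenseTensorType> {
  using Base = TypeImpl<DenseTensorType>;
  using Base::Base;
};

int main() {
  DenseTensorType t("pd.dense_tensor");
  t.Describe();
  return 0;
}
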
+ +#include + +#include "llvm/ADT/Optional.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Region.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/Passes.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +llvm::raw_ostream &printIndent(int indent = 0) { + for (int i = 0; i < indent; ++i) llvm::outs() << " "; + return llvm::outs(); +} + +void printOperation(mlir::Operation *op, int indent); +void printRegion(mlir::Region ®ion, int indent); // NOLINT +void printBlock(mlir::Block &block, int indent); // NOLINT + +void printOperation(mlir::Operation *op, int indent) { + llvm::Optional module_op = llvm::None; + if (llvm::isa(op)) + module_op = llvm::dyn_cast(op); + llvm::Optional func_op = llvm::None; + if (llvm::isa(op)) func_op = llvm::dyn_cast(op); + + printIndent(indent) << "op: '" << op->getName(); + // This getName is inherited from Operation::getName + if (module_op) { + printIndent() << "@" << module_op->getName(); + } + // This getName is inherited from SymbolOpInterfaceTrait::getName, + // which return value of "sym_name" in ModuleOp or FuncOp attributes. + if (func_op) { + printIndent() << "@" << func_op->getName(); + } + printIndent() << "' with " << op->getNumOperands() << " operands" + << ", " << op->getNumResults() << " results" + << ", " << op->getAttrs().size() << " attributes" + << ", " << op->getNumRegions() << " regions" + << ", " << op->getNumSuccessors() << " successors\n"; + if (!op->getAttrs().empty()) { + printIndent(indent) << op->getAttrs().size() << " attributes:\n"; + for (mlir::NamedAttribute attr : op->getAttrs()) { + printIndent(indent + 1) << "- {" << attr.first << " : " << attr.second + << "}\n"; + } + } + + if (op->getNumRegions() > 0) { + printIndent(indent) << op->getNumRegions() << " nested regions:\n"; + for (mlir::Region ®ion : op->getRegions()) { + printRegion(region, indent + 1); + } + } +} + +void printRegion(mlir::Region ®ion, int indent) { // NOLINT + printIndent(indent) << "Region with " << region.getBlocks().size() + << " blocks:\n"; + for (mlir::Block &block : region.getBlocks()) { + printBlock(block, indent + 1); + } +} + +void printBlock(mlir::Block &block, int indent) { // NOLINT + printIndent(indent) << "Block with " << block.getNumArguments() + << " arguments" + << ", " << block.getNumSuccessors() << " successors" + << ", " << block.getOperations().size() + << " operations\n"; + + for (mlir::Operation &operation : block.getOperations()) { + printOperation(&operation, indent + 1); + } +} + +int main(int argc, char **argv) { + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "mlir demo"); + + mlir::MLIRContext *context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + auto ®istry = context->getDialectRegistry(); + infrt::RegisterCinnDialects(registry); + + // mlir will verify 
module automatically after parsing.
+ // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051
+ // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source,
+ // context);
+ mlir::OwningModuleRef module_ref =
+ mlir::parseSourceFile(inputFilename, context);
+ std::cout << "----------print IR Structure begin----------" << std::endl;
+ printOperation(module_ref->getOperation(), 0);
+ std::cout << "----------print IR Structure end----------" << std::endl;
+
+ module_ref->dump();
+ return 0;
+}
diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td
new file mode 100644
index 0000000000000..aa81dd72d059b
--- /dev/null
+++ b/paddle/infrt/dialect/rewrite.td
@@ -0,0 +1,90 @@
+#ifndef INFRT_REWRITE
+#define INFRT_REWRITE
+
+include "paddle/infrt/dialect/infrt_base.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "paddle/infrt/dialect/pd_ops.td"
+
+//===----------------------------------------------------------------------===//
+// This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'.
+//
+// We have:
+// (Matmul) z = x * y
+// (Add) out = z + bias
+//
+// which corresponds to the following computation:
+// (FusedFC) out = x * y + bias
+//
+// TODO:
+// 1. Make the constraints more complete.
+// 2. Consider the case of: out = bias + z
+//===----------------------------------------------------------------------===//
+def FuseMulAdd : Pat<(PD_ElementwiseAdd (PD_MatmulOp $x, $y, $transpose_x, $transpose_y, $alpha), $bias, $axis),
+ (PD_FusedFC $x, $y, $bias, (INFRT_createI32Attr<"1">)),
+ [(IsBoolAttrEq<"false"> $transpose_x),(IsBoolAttrEq<"false"> $transpose_y)]>;
+
+
+//===----------------------------------------------------------------------===//
+// This is to fuse the composition: 'FusedFC o Relu' into 'FusedRepeatedFCRelu'.
+//
+// We have:
+// (FusedFC) z = fc(x, y, bias)
+// (Relu) out = relu(z)
+//
+// which corresponds to the following computation:
+// (FusedRepeatedFCRelu) out = RepeatedFCRelu(x, [y], [bias])
+//
+//===----------------------------------------------------------------------===//
+def FuseFCRelu : Pat<(PD_ReluOp (PD_FusedFC $x, $y, $bias, $_)),
+ (PD_FusedRepeatedFCRelu $x, (INFRT_cvtValueToValueRange $y), (INFRT_cvtValueToValueRange $bias))>;
+
+//===----------------------------------------------------------------------===//
+// This is to fold 'FusedRepeatedFCRelu' op.
+// +// We have: +// (FusedRepeatedFCRelu) z = RepeatedFCRelu(x, [y, ...], [bias, ...]) +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(z, [y1, ...], [bias1, ...]) +// +// which corresponds to the following computation: +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(x, [y, ..., y1, ...], [bias, ..., bias1, ....]) +// +//===----------------------------------------------------------------------===// +def FuseRepeatedFCRelu2 : Pat<(PD_FusedRepeatedFCRelu (PD_FusedRepeatedFCRelu $x, $y, $bias), $y_2, $bias_2), + (PD_FusedRepeatedFCRelu $x, (INFRT_concatTwoValueRange $y, $y_2), (INFRT_concatTwoValueRange $bias, $bias_2))>; + + +//===----------------------------------------------------------------------===// +// This is to fuse the composition: 'BatchNorm o Conv' into 'Conv' +// by deriving new 'w' and 'b' for 'Conv': +// +// We have: +// (Conv) z = w * x + b +// (BatchNorm) y = scale * (z - mean) / sqrt(var + eps) + bias +// +// which corresponds to the following computation: +// y = w_ * x + b_ +// where +// w_ = scale * w / sqrt(var + eps) +// b_ = B + scale * (b - mean) / sqrt(var + eps) +// +//===----------------------------------------------------------------------===// +def FuseBatchNormWithConvPattern: Pat< + (PD_BatchNormOp + (PD_Conv2dOp $input, $filter, $bias), + $scale, $bias_2, $mean, $var, $epsilon), + (PD_Conv2dOp + $input, + (PD_MulOp $filter, + (PD_ElementwiseDiv:$coefficientW + $scale, + (PD_SqrtOp (PD_ElementwiseAdd $var, (PD_ConstantOp $epsilon), (INFRT_createI32Attr<"1">))), + (INFRT_createI32Attr<"1">))), + (PD_ElementwiseAdd + $bias, + (PD_MulOp + (PD_ElementwiseSub $bias, $mean, (INFRT_createI32Attr<"1">)), + $coefficientW), + (INFRT_createI32Attr<"1">))) +>; + +#endif // INFRT_REWRITE diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc new file mode 100644 index 0000000000000..ef5a5525cb22f --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
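Before the tensor-shape dialect sources that follow, here is a small scalar sanity check of the Conv + BatchNorm folding algebra derived in the rewrite.td comment above (w_ = scale * w / sqrt(var + eps), b_ = bias + scale * (b - mean) / sqrt(var + eps)). It is a standalone illustration with arbitrary example values, not part of the patch.

#include <cassert>
#include <cmath>
#include <iostream>

int main() {
  // Arbitrary example values for the scalar case.
  const double x = 0.7, w = 1.3, b = 0.2;           // Conv: z = w * x + b
  const double scale = 0.9, bn_bias = 0.1;          // BatchNorm affine parameters
  const double mean = 0.05, var = 0.4, eps = 1e-5;

  // Unfused: run Conv, then BatchNorm.
  const double z = w * x + b;
  const double y = scale * (z - mean) / std::sqrt(var + eps) + bn_bias;

  // Fused: fold BatchNorm into the Conv weight/bias, as in rewrite.td.
  const double w_fused = scale * w / std::sqrt(var + eps);
  const double b_fused = bn_bias + scale * (b - mean) / std::sqrt(var + eps);
  const double y_fused = w_fused * x + b_fused;

  assert(std::fabs(y - y_fused) < 1e-12);
  std::cout << "y = " << y << ", y_fused = " << y_fused << "\n";
  return 0;
}
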
+ +#include "paddle/infrt/dialect/tensor_shape.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infrt::ts { +using namespace mlir; // NOLINT + +void TensorShapeDialect::initialize() { + allowUnknownTypes(); + addTypes(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/tensor_shape.cpp.inc" + >(); +} + +Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { + StringRef keyword; + if (parser.parseKeyword(&keyword)) return Type(); + if (keyword == "shape") return ShapeType::get(getContext()); + if (keyword == "partial_shape") return PartialShapeType::get(getContext()); + + parser.emitError(parser.getNameLoc(), "unknown shape type: ") << keyword; + return Type(); +} + +void TensorShapeDialect::printType(::mlir::Type type, + ::mlir::DialectAsmPrinter &os) const { + if (type.isa()) { + os << "shape"; + return; + } + + if (type.isa()) { + os << "partial_shape"; + return; + } + llvm_unreachable("unexpected 'shape' type kind"); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT + +} // namespace infrt::ts diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h new file mode 100644 index 0000000000000..bd3fa8853675a --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace infrt::ts { + +class ShapeType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +class PartialShapeType : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +using namespace mlir; // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensor_shape.hpp.inc" +#include "paddle/infrt/dialect/tensor_shape_dialect.hpp.inc" + +} // namespace infrt::ts diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td new file mode 100644 index 0000000000000..d3714c8ed14d3 --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.td @@ -0,0 +1,49 @@ +#ifdef INFRT_OPS +#else +#define INFRT_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/tensor_shape_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for the operation in the TensorShape dialect +class TS_Op traits = []> : + Op { + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; + let printer = " return infrt::dialect::printOpWithOperands(p, *this)" ";"; +} + +def TS_BuildShapeOp : TS_Op<"build_shape", [NoSideEffect]> { + let summary = "Build tensor shape operation"; + let description = [{ + An operation that builds a tensor shape of given ranks and extents. 
+ }]; + + let arguments = (ins I64ArrayAttr:$value); + let results = (outs TS_Shape:$output); + let assemblyFormat = "$value attr-dict"; +} + +def TS_GetNumElementsOp : TS_Op<"get_num_elements"> { + let summary = "Returns the number of elements in the shape"; + + let description = [{ + An operation that returns the number of elements in the given shape. + }]; + + let arguments = (ins TS_Shape); + let results = (outs I64); + let assemblyFormat = "operands attr-dict"; +} + +def TS_PrintShapeOp : TS_Op<"print_shape"> { + let summary = "Print tensor shape operation"; + let description = [{ + An operation that prints a tensor shape. + }]; + + let arguments = (ins TS_Shape:$shape); + let assemblyFormat = "operands attr-dict"; +} + +#endif diff --git a/paddle/infrt/dialect/tensor_shape_base.td b/paddle/infrt/dialect/tensor_shape_base.td new file mode 100644 index 0000000000000..ea1c1854d77ca --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape_base.td @@ -0,0 +1,36 @@ +#ifdef TS_OPS_BASE +#else +#define TS_OPS_BASE + +// Tensor shape dialect. +def TensorShapeDialect : Dialect { + let name = "ts"; + + let description = [{ + The Tensor Shape dialect. + + This dialect contains operations for working with tensor shapes. + }]; + + let cppNamespace = "::infrt::ts"; +} + +// Type definition. +def TS_Shape : DialectType()">, "!ts.shape type">, +BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { + let typeDescription = [{ + `!ts.shape type` represents a static tensor shape. +}]; +} + +def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, +BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { + let typeDescription = [{ + `!ts.partial_shape type` represents either a static tensor shape, unranked + tensor shape or a ranked tensor shape with unknown dimension sizes. +}]; +} + +#endif // TS_OPS_BASE diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc new file mode 100644 index 0000000000000..894d96f95ad5c --- /dev/null +++ b/paddle/infrt/dialect/test_kernels.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/test_kernels.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" + +namespace infrt::dialect { + +//===----------------------------------------------------------------------===// +// BenchmarkOp +//===----------------------------------------------------------------------===// + +// Parse the BenchmarkOp in the following format +// infrt.benchmark "add.i32"(%c : i32, %d : f32) +// max_count = 100, duration_secs = 1 { +// ... +// } + +static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + StringAttr nameAttr; + if (parser.parseAttribute(nameAttr, "name", result.attributes)) + return failure(); + + // Parse the operands, e.g. 
(%c : i32, %d : f32) + if (parser.parseLParen()) return failure(); + + SmallVector operands; + SmallVector types; + llvm::SMLoc type_loc = parser.getCurrentLocation(); + + if (parser.parseOptionalRParen()) { + // Parse non-empty operands + do { + // Parse %c : i32, + OpAsmParser::OperandType operand; + Type type; + + if (parser.parseOperand(operand) || parser.parseColonType(type)) + return failure(); + + operands.push_back(operand); + types.push_back(type); + } while (succeeded(parser.parseOptionalComma())); + + if (parser.parseRParen()) return failure(); + } + + if (parser.resolveOperands(operands, types, type_loc, result.operands)) + return failure(); + + // Parse the keyword attribute, e.g. max_count = 100, duration_secs = 1 + do { + StringRef attr; + Attribute resultAttr; + if (parser.parseKeyword(&attr) || parser.parseEqual() || + parser.parseAttribute(resultAttr, + parser.getBuilder().getIntegerType(32), + attr, + result.attributes)) + return failure(); + } while (succeeded(parser.parseOptionalComma())); + + // Set the default attribute num_warmup_runs to 1 if unset + auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { + bool found = llvm::any_of(result.attributes, + [attr_name](const NamedAttribute &attr) { + return attr.first == attr_name; + }); + if (!found) { + IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + result.addAttribute(attr_name, default_val); + } + }; + setDefaultAttrIfUnset("num_warmup_runs", 1); + + Region *target = result.addRegion(); + return parser.parseRegion(*target, + operands, + types, + /*enableNameShadowing=*/true); +} + +// Print the BenchmarkOp in the following format +// infrt.benchmark "add.i32"(%c : i32, %d : f32) +// max_count = 100, duration_secs = 1 { +// ... +// } +static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT + p << "infrt.benchmark "; + + // Print the name attribute, e.g "add.i32" + auto name_attr = op.getAttr("name"); + p << name_attr; + + // Print the operands and types, e.g. (%c : i32, %d : f32) + p << '('; + llvm::interleaveComma(llvm::zip(op.getOperands(), op.getOperandTypes()), + p, + [&](const auto &it) { + p << std::get<0>(it) << " : " << std::get<1>(it); + }); + p << ") "; + + bool need_comma = false; + // Print the attributes, e.g. max_count = 100, duration_secs = 1 + for (auto &name_attr : op.getAttrs()) { + auto id = name_attr.first; + if (id == "name") continue; + if (need_comma) p << ", "; + auto attr = name_attr.second; + p << id << " = "; + if (auto int_attr = attr.dyn_cast()) { + int_attr.getValue().print(p.getStream(), /*isSigned=*/false); + } else { + op.emitOpError("Unexpected attribute"); + } + need_comma = true; + } + p << ' '; + + // Print the region + // Reuse the argument names provided to the op for the bbarg names within + // the region. + p.shadowRegionArgs(op.region(), op.getOperands()); + p.printRegion(op.region(), /*printEntryBlockArgs=*/false); +} + +static LogicalResult verify(BenchmarkOp op) { + // Verify that the target benchmark region has exactly one return value. + auto ®ion = op.region(); + auto &last_op = region.front().back(); + if (last_op.getName().getStringRef() != "infrt.return") { + return op.emitOpError("missing return statement"); + } + if (last_op.getNumOperands() != 1) { + return op.emitOpError( + "incorrect number of return values. 
One return value is expected");
+ }
+
+ return success();
+}
+
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/test_kernels.cpp.inc"
+
+} // namespace infrt::dialect
diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h
new file mode 100644
index 0000000000000..29d4209cb7280
--- /dev/null
+++ b/paddle/infrt/dialect/test_kernels.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+namespace infrt::dialect {
+using namespace mlir; // NOLINT
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/test_kernels.hpp.inc"
+} // namespace infrt::dialect
diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/test_kernels.td
new file mode 100644
index 0000000000000..6aa12f252d014
--- /dev/null
+++ b/paddle/infrt/dialect/test_kernels.td
@@ -0,0 +1,65 @@
+// Operation definitions for testing.
+
+#ifdef TEST_OPS
+#else
+#define TEST_OPS
+
+include "paddle/infrt/dialect/infrt_base.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+// Base class for Test dialect ops.
+class Test_Op traits = []> :
+ Op {
+
+ // Each registered op in the Test namespace needs to provide a printer,
+ // parser and verifier.
+ let printer = [{ return infrt::dialect::print(p, *this); }];
+ let verifier = [{ return infrt::dialect::verify(*this); }];
+ let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }];
+}
+
+def BenchmarkOp : Test_Op<"benchmark"> {
+ let summary = "benchmark operation";
+ let description = [{
+ The "infrt.benchmark" operation benchmarks the performance of an MLIR
+ region by executing the given MLIR region repeatedly for up to
+ `duration_secs` seconds or `max_count` times. `num_warmup_runs` specifies
+ the number of warm up runs to run the given MLIR region before the
+ benchmark starts.
+
+ The target MLIR region can take an arbitrary number of arguments and
+ should return exactly one value. The arguments for the MLIR region are
+ provided as the operands of the infrt.benchmark op.
+
+ Example:
+ infrt.benchmark "add.i32"(%c : i32, %d : f32) max_count = 100, duration_secs = 1 {
+ // code for benchmarking
+ ...
+ }
+
+ infrt.benchmark "add.i32"(%c : i32)
+ duration_secs = 1,
+ max_count = 100,
+ num_warmup_runs = 10 {
+ // The MLIR code to be benchmarked goes here.
+ // The following code benchmarks the infrt.add.i32 kernel.
+ %x = infrt.add.i32 %c, %c
+ // The benchmarked function needs to return exactly one value.
+ infrt.return %x : i32 + } + }]; + + let regions = (region SizedRegion<1>:$region); + + let arguments = (ins + Variadic, + I32Attr:$duration_secs, + I32Attr:$max_count, + StrAttr:$name, + DefaultValuedAttr:$num_warmup_runs + ); + + let results = (outs); +} + +#endif // TEST_OPS diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc new file mode 100644 index 0000000000000..6d6f6a20b46c9 --- /dev/null +++ b/paddle/infrt/dialect/types.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/types.h" + +namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h new file mode 100644 index 0000000000000..a9a2b61871cc0 --- /dev/null +++ b/paddle/infrt/dialect/types.h @@ -0,0 +1,16 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt new file mode 100644 index 0000000000000..faffc3909bc1e --- /dev/null +++ b/paddle/infrt/external_kernels/CMakeLists.txt @@ -0,0 +1,13 @@ +set(external_kernels_src "basic_kernels.cc") + +cc_library(external_kernels SHARED SRCS ${external_kernels_src}) +set_target_properties(external_kernels PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + +set(basic_mlir "${CMAKE_CURRENT_SOURCE_DIR}/basic.mlir") +set(external_kernels_lib "${CMAKE_CURRENT_BINARY_DIR}/libexternal_kernels.so") +message(STATUS "basic_mlir: ${basic_mlir}") +message(STATUS "external_kernels_lib: ${external_kernels_lib}") +add_test( + NAME run_and_check_external_kernels + COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" +) diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir new file mode 100644 index 0000000000000..843b12ced21a9 --- /dev/null +++ b/paddle/infrt/external_kernels/basic.mlir @@ -0,0 +1,21 @@ +// CHECK: basic +func @basic() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK: 1 + "external.print.f32"(%v0) : (f32) -> () + // CHECK: 2 + "external.print.f32"(%v1) : (f32) -> () + + // CHECK: 3 + "external.print.f32"(%v2) : (f32) -> () + + %v3 = "external.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + // CHECK: 6 + "external.print.f32"(%v3) : (f32) -> () + + infrt.return %v3 : f32 +} diff --git a/paddle/infrt/external_kernels/basic_kernels.cc b/paddle/infrt/external_kernels/basic_kernels.cc new file mode 100644 index 0000000000000..b59a8881fb092 --- /dev/null +++ b/paddle/infrt/external_kernels/basic_kernels.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
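basic_kernels.cc below supplies the RegisterKernels entry point that the test target above loads via --shared_libs. The loader inside infrt-exec is not shown in this part of the patch, so the following is only a plausible minimal sketch of that side, with illustrative names; it relies on RegisterKernels being declared with C linkage in kernel_registry.h (later in this patch) so the symbol can be resolved unmangled.

#include <dlfcn.h>

#include <iostream>
#include <string>

#include "paddle/infrt/host_context/kernel_registry.h"

// Hypothetical loader; the real one lives in the host_context executor.
using RegisterKernelsFn = void (*)(infrt::host_context::KernelRegistry *);

bool LoadExternalKernels(const std::string &so_path,
                         infrt::host_context::KernelRegistry *registry) {
  void *handle = dlopen(so_path.c_str(), RTLD_NOW);
  if (!handle) {
    std::cerr << "dlopen(" << so_path << ") failed: " << dlerror() << "\n";
    return false;
  }
  // Unmangled lookup works because kernel_registry.h declares the hook extern "C".
  auto register_fn =
      reinterpret_cast<RegisterKernelsFn>(dlsym(handle, "RegisterKernels"));
  if (!register_fn) {
    std::cerr << "RegisterKernels symbol not found: " << dlerror() << "\n";
    return false;
  }
  register_fn(registry);  // Adds external.add.f32, external.print.f32, ...
  return true;
}
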
+ +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +template +T add(T a, T b) { + return a + b; +} + +template +T sub(T a, T b) { + return a - b; +} + +template +T mul(T a, T b) { + return a * b; +} + +template +T div(T a, T b) { + return a / b; +} + +template +void print(T a) { + std::cout << a << std::endl; +} + +void RegisterKernels(infrt::host_context::KernelRegistry *registry) { + // int32 + registry->AddKernel("external.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("external.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("external.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("external.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("external.print.i32", INFRT_KERNEL(print)); + + // float + registry->AddKernel("external.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("external.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("external.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("external.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("external.print.f32", INFRT_KERNEL(print)); +} diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir new file mode 100644 index 0000000000000..bdac9ded2ef65 --- /dev/null +++ b/paddle/infrt/external_kernels/fc.mlir @@ -0,0 +1,43 @@ +// CHECK-LABEL: @fc +func @fc(%input : !infrt.tensor, + %w : !infrt.tensor, + %bias : !infrt.tensor) -> !infrt.tensor +{ + %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + // dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + + // fc1 + "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + + // fc2 + "external.matmul"(%out, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + + infrt.return %out : !infrt.tensor +} + +// CHECK-LABEL: @benchmark +func @benchmark() { + %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + + infrt.benchmark "add.f32"( + %input:!infrt.tensor, + %w:!infrt.tensor, + %bias:!infrt.tensor) + duration_secs = 100, max_count = 300000, num_warmup_runs = 3 + { + %res = infrt.call @fc(%input, %w, %bias) : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor) + infrt.return %res : !infrt.tensor + } + infrt.return +} diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir new file mode 100644 index 0000000000000..e7b8e9efba838 --- /dev/null +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -0,0 +1,50 @@ +// CHECK: paddle_func +func @paddle_func() -> () { + %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : 
!infrt.tensor) {value=2.0:f32} + + %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + + %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.tensor, !infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%input : !infrt.tensor) + // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + dt.print_tensor (%w : !infrt.tensor) + dt.print_tensor (%bias : !infrt.tensor) + dt.print_tensor (%out : !infrt.tensor) + + // test external.matmul + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out1 : !infrt.tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out1 : !infrt.tensor) + + // test external.elementwise_add + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out2 : !infrt.tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out2 : !infrt.tensor) + + // test external.relu + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out3 : !infrt.tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out3 : !infrt.tensor) + + // test external.sigmoid + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out4 : !infrt.tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out4 : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/gtest_main.cc b/paddle/infrt/gtest_main.cc new file mode 100644 index 0000000000000..26e2b5dcfc61a --- /dev/null +++ b/paddle/infrt/gtest_main.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + gflags::ParseCommandLineFlags(&argc, &argv, false); + + return RUN_ALL_TESTS(); +} diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt new file mode 100644 index 0000000000000..fdba9af4a5912 --- /dev/null +++ b/paddle/infrt/host_context/CMakeLists.txt @@ -0,0 +1,29 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + kernel_frame.cc + kernel_registry.cc + value.cc + kernel_utils.cc + symbol_table.cc + op_executable.cc + core_runtime.cc + mlir_to_runtime_translate.cc + function.cc + mlir_function_executable.cc + mlir_program_executor.cc + ) + +cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS}) + +infrt_exec_check(test_infrt_mlir_exec_on_basic mlir_tests/basic.mlir) +infrt_exec_check(test_infrt_mlir_exec_on_shape mlir_tests/shape.mlir) +infrt_exec_check(test_infrt_mlir_exec_on_dense_tensor mlir_tests/dense_tensor.mlir) + +add_executable(infrt-exec mlir_exec.cc) +target_link_libraries(infrt-exec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc new file mode 100644 index 0000000000000..cdb8cc99ecb26 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/host_context/core_runtime.h" + +#include + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt::host_context { + +struct CoreRuntime::Impl { + KernelRegistry* kernel_registry{}; + SymbolTable symbol_table; + std::vector op_executables; + + mutable std::vector results; +}; + +SymbolTable* CoreRuntime::symbol_table() { return &impl_->symbol_table; } + +CoreRuntime::CoreRuntime(CoreRuntime::Impl* impl) : impl_(impl) { CHECK(impl); } + +void CoreRuntime::Execute() { + // std::cout << "CoreRuntime::Execute" << std::endl; + int op_offset = 0; + for (auto& op : impl_->op_executables) { + VLOG(3) << "running op " << op_offset++ << " " << op.name(); + op.Execute(); + } +} + +KernelRegistry* CoreRuntime::kernel_registry() const { + return impl_->kernel_registry; +} + +size_t CoreRuntime::num_ops() const { return impl_->op_executables.size(); } + +CoreRuntimeBuilder::CoreRuntimeBuilder(KernelRegistry* kernel_registry) + : CoreRuntime(new Impl) { + impl_->kernel_registry = + kernel_registry ? kernel_registry : GetCpuKernelRegistry(); +} + +OpExecutableBuilder* CoreRuntimeBuilder::NewOpExecutable( + const std::string& op_name) { + CHECK(impl_.get()); + impl_->op_executables.emplace_back( + op_name, symbol_table(), impl_->kernel_registry); + return &impl_->op_executables.back(); +} + +void CoreRuntimeBuilder::FeedInArgs( + llvm::ArrayRef> args) { + for (auto& item : args) { + symbol_table()->Register(item.first, item.second); + } +} + +void CoreRuntimeBuilder::SetKernelRegistry(KernelRegistry* x) { + CHECK(x); + impl_->kernel_registry = x; +} + +llvm::SmallVector CoreRuntime::GetResults( + llvm::ArrayRef arg_names) { + llvm::SmallVector results; + for (auto& name : arg_names) { + results.push_back(ValueRef(symbol_table()->GetValue(name))); + } + + return results; +} + +CoreRuntime::~CoreRuntime() {} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h new file mode 100644 index 0000000000000..802f8b17bb010 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime.h @@ -0,0 +1,86 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/host_context/value.h" + +namespace infrt::host_context { + +class KernelRegistry; +class OpExecutable; +class OpExecutableBuilder; +class SymbolTable; + +/** + * CoreRuntime encapsulate the execution for a sequence of ops. + * Each function call will bind to a CoreRuntime instance, push the argument + * Values in to the argument-list, and get the + * result Values from the return-list. + */ +class CoreRuntime : public std::enable_shared_from_this { + public: + //! Execute a program. + void Execute(); + + //! Return the number of ops. 
+ size_t num_ops() const; + + //! Get the results of the execution. + llvm::SmallVector // + GetResults(llvm::ArrayRef arg_names); + + std::shared_ptr getptr() { + return std::shared_ptr(this); + } + + KernelRegistry* kernel_registry() const; + + ~CoreRuntime(); + + protected: + //! Get the symbol table. + SymbolTable* symbol_table(); + + class Impl; + explicit CoreRuntime(Impl* impl); + std::unique_ptr impl_; +}; + +/** + * The builder for CoreRuntime, help to construct a function. + */ +class CoreRuntimeBuilder : public CoreRuntime { + public: + explicit CoreRuntimeBuilder(KernelRegistry* kernel_registry); + + using CoreRuntime::symbol_table; + + void SetKernelRegistry(KernelRegistry* x); + + //! Feed the input arguments, each item is a pair of arg-name and arg-value. + void FeedInArgs(llvm::ArrayRef> args); + + llvm::ArrayRef attr_names() const; + + OpExecutableBuilder* NewOpExecutable(const std::string& op_name); +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/core_runtime_test.cc b/paddle/infrt/host_context/core_runtime_test.cc new file mode 100644 index 0000000000000..3c0dadaad42e7 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime_test.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/host_context/core_runtime.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt { +namespace host_context { + +int add(int a, int b) { return a + b; } +int sub(int a, int b) { return a - b; } + +TEST(CoreRuntime, basic) { + KernelRegistry registry; + registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add)); + registry.AddKernel("infrt.test.subi32", INFRT_KERNEL(sub)); + + CoreRuntimeBuilder builder(®istry); + auto* table = builder.symbol_table(); + table->Register("a", 1); + table->Register("b", 2); + table->Register("d", 4); + + // c = a + b + auto* op0 = builder.NewOpExecutable("infrt.test.addi32"); + op0->AppendArgument("a"); + op0->AppendArgument("b"); + op0->SetResults({"c"}); + + // e = c - d + auto* op1 = builder.NewOpExecutable("infrt.test.subi32"); + op1->AppendArgument("c"); + op1->AppendArgument("d"); + op1->SetResults({"e"}); + + builder.Execute(); + + ASSERT_EQ(table->GetValue("d")->get(), 4); + ASSERT_EQ(table->GetValue("c")->get(), 3); + ASSERT_EQ(table->GetValue("e")->get(), -1); +} + +TEST(CoreRuntime, function) { + // The function: + // func(int a, int b) { + // int c = a + b + // return c + // } + KernelRegistry registry; + registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add)); + registry.AddKernel("infrt.test.subi32", INFRT_KERNEL(sub)); + + CoreRuntimeBuilder builder(®istry); + auto* table = builder.symbol_table(); + + std::vector> feeds{ + {std::make_pair("a", ValueRef(new Value(1))), // + std::make_pair("b", ValueRef(new Value(2)))}}; + builder.FeedInArgs(llvm::ArrayRef>( + feeds.data(), feeds.size())); + + ASSERT_EQ(table->Get("a"), 1); + ASSERT_EQ(table->Get("b"), 2); + ASSERT_EQ(table->size(), 2UL); + + auto* op = builder.NewOpExecutable("infrt.test.addi32"); + op->AppendArgument("a"); + op->AppendArgument("b"); + op->SetResults({"c"}); + + builder.Execute(); + + auto res = builder.GetResults({"c"}); + ASSERT_EQ(res.size(), 1UL); + ASSERT_EQ(res[0].get(), 3); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/function.cc b/paddle/infrt/host_context/function.cc new file mode 100644 index 0000000000000..8b111f2645a80 --- /dev/null +++ b/paddle/infrt/host_context/function.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/function.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/function.h b/paddle/infrt/host_context/function.h new file mode 100644 index 0000000000000..030e3b6cfbc09 --- /dev/null +++ b/paddle/infrt/host_context/function.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include + +namespace infrt { +namespace host_context { + +struct Value; +struct ValueRef; + +/** + * Base class of all executable Function. + * + * This is used by `infrt.call` op, to execute a function. + */ +class Function { + public: + Function(Function&& other) + : name_(other.name_), + num_arguments_(other.num_arguments_), + num_results_(other.num_results_) {} + + Function() = delete; + + std::string name() const { return name_; } + + size_t num_arguments() const { return num_arguments_; } + size_t num_results() const { return num_results_; } + + virtual void Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region = false) const {} + + virtual ~Function() = default; + + protected: + Function(std::string name, size_t num_arguments, size_t num_results) + : name_(name), num_arguments_(num_arguments), num_results_(num_results) {} + + private: + std::string name_; + size_t num_arguments_{}; + size_t num_results_{}; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc new file mode 100644 index 0000000000000..1acb35e898308 --- /dev/null +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_frame.h" + +#include + +namespace infrt { +namespace host_context { + +std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { + os << "KernelFrame: " << frame.GetNumArgs() << " args, " + << frame.GetNumResults() << " res, " << frame.GetNumResults() << " attrs"; + return os; +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h new file mode 100644 index 0000000000000..20cb17dc7fbe2 --- /dev/null +++ b/paddle/infrt/host_context/kernel_frame.h @@ -0,0 +1,166 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include + +#include "llvm/ADT/SmallVector.h" +#include "paddle/infrt/host_context/value.h" + +namespace infrt::host_context { + +/** + * KernelFrame captures the states(input arguments, attributes, results) + * associated with a kernel invocation. + */ +class KernelFrame { + public: + int GetNumArgs() const { return num_arguments_; } + int GetNumResults() const { return num_results_; } + int GetNumAttributes() const { + return value_or_attrs_.size() - num_arguments_ - + (num_results_ == -1 ? 0 : num_results_); + } + + template + T& GetArgAt(int index) { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]->get(); + } + template + const T& GetArgAt(int index) const { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]->get(); + } + + Value* GetArgAt(int index) { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]; + } + + // Get all arguments. + llvm::ArrayRef GetArguments() const { + return GetValues(0, num_arguments_); + } + + Value* GetAttributeAt(int idx) { + CHECK_NE(num_results_, -1) + << "Must call SetNumResults before GetAttributeAt"; + CHECK_LT(idx, + static_cast(value_or_attrs_.size() - num_arguments_ - + num_results_)); + return value_or_attrs_[num_arguments_ + num_results_ + idx]; + } + + void AddAttribute(Value* v) { + CHECK_NE(num_results_, -1) + << "Must call SetNumResults before calling AddAttribute"; + value_or_attrs_.emplace_back(v); + } + + template + void EmplaceResult(Args&&... args) { + EmplaceResult(0, std::forward(args)...); + } + + template + void EmplaceResult(int index, Args&&... 
args) { + SetResultAt(index, T(std::forward(args)...)); + } + + template + void SetResultAt(int index, T&& value) { + CHECK_LT(index, num_results_) << "Invalid result index"; + CHECK(value_or_attrs_[num_arguments_ + index]); + value_or_attrs_[num_arguments_ + index]->set(std::move(value)); + } + + llvm::ArrayRef GetResults() const { + return GetValues(num_arguments_, num_results_); + } + llvm::MutableArrayRef GetResults() { + return GetMutableValues(num_arguments_, num_results_); + } + + llvm::ArrayRef GetValues(size_t from, size_t length) const { + CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + if (length == 0) return {}; + + return llvm::makeArrayRef(&value_or_attrs_[from], length); + } + + llvm::MutableArrayRef GetMutableValues(size_t from, size_t length) { + CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + if (length == 0) return {}; + return llvm::makeMutableArrayRef(&value_or_attrs_[from], length); + } + + protected: + int num_arguments_{}; + int num_results_{-1}; + + llvm::SmallVector value_or_attrs_; +}; + +std::ostream& operator<<(std::ostream& os, const KernelFrame& frame); + +class KernelFrameBuilder : public KernelFrame { + public: + void AddArgument(Value* value) { + CHECK(value); + CHECK_EQ(num_results_, -1) + << "Should call AddArgument before calling SetNumResults"; + value_or_attrs_.push_back(value); + ++num_arguments_; + } + + void SetResults(llvm::ArrayRef values) { + CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); + CHECK_EQ(num_results_, -1); + for (Value* x : values) { + value_or_attrs_.push_back(x); + } + num_results_ = values.size(); + } + + void SetNumResults(size_t n) { + CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); + CHECK_EQ(num_results_, -1); + num_results_ = n; + for (size_t i = 0; i < n; i++) { + value_or_attrs_.emplace_back(new Value); + } + } + + void SetResultAt(int result_id, Value* value) { + CHECK_EQ(static_cast(value_or_attrs_.size()), + num_arguments_ + num_results_) + << "Call SetNumResults first"; + CHECK_LT(result_id + num_arguments_, + static_cast(value_or_attrs_.size())); + CHECK(value); + value_or_attrs_[num_arguments_ + result_id]->set(value); + } + + void Reset() { + value_or_attrs_.clear(); + num_arguments_ = 0; + num_results_ = -1; + } +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc new file mode 100644 index 0000000000000..f343dfc71b040 --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
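Before the registry implementation below, it may help to see how a kernel consumes a KernelFrame directly. The INFRT_KERNEL convenience macro wraps ordinary C++ functions, but its definition in kernel_utils.h is only partially shown here, so this sketch writes a frame-based kernel by hand using just the KernelFrame/KernelFrameBuilder API above; names are illustrative, and Value is assumed to expose get<T>() as the tests later in this patch use.

#include <iostream>

#include "paddle/infrt/host_context/kernel_frame.h"

namespace example {

// A kernel written directly against KernelFrame: reads two int arguments and
// emplaces one int result at index 0.
void AddI32Kernel(infrt::host_context::KernelFrame *frame) {
  const int a = frame->GetArgAt<int>(0);
  const int b = frame->GetArgAt<int>(1);
  frame->EmplaceResult<int>(a + b);
}

void Demo() {
  infrt::host_context::ValueRef a(1);
  infrt::host_context::ValueRef b(2);

  infrt::host_context::KernelFrameBuilder frame;
  frame.AddArgument(a.get());
  frame.AddArgument(b.get());
  frame.SetNumResults(1);  // must be set before the kernel writes results

  AddI32Kernel(&frame);
  std::cout << frame.GetResults()[0]->get<int>() << "\n";  // prints 3
}

}  // namespace example
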
+ +#include "paddle/infrt/host_context/kernel_registry.h" + +#include + +#include "glog/logging.h" +#include "llvm/ADT/SmallVector.h" + +namespace infrt { +namespace host_context { + +struct KernelRegistry::Impl { + std::unordered_map data; + std::unordered_map> attr_names; +}; + +KernelRegistry::KernelRegistry() : impl_(std::make_unique()) {} + +void KernelRegistry::AddKernel(const std::string &key, + KernelImplementation fn) { + CHECK(!impl_->data.count(key)) << "kernel [" << key + << "] is registered twice"; + impl_->data.emplace(key, fn); +} + +void KernelRegistry::AddKernelAttrNameList( + const std::string &key, const std::vector &names) { + CHECK(!impl_->attr_names.count(key)) + << "kernel [" << key << "] is registered twice in attribute names"; + impl_->attr_names.emplace( + key, llvm::SmallVector(names.begin(), names.end())); +} + +KernelImplementation KernelRegistry::GetKernel(const std::string &key) const { + auto it = impl_->data.find(key); + return it != impl_->data.end() ? it->second : KernelImplementation{}; +} + +std::vector KernelRegistry::GetKernelList() const { + std::vector res(impl_->data.size()); + for (auto i : impl_->data) { + res.push_back(i.first); + } + return res; +} + +KernelRegistry::~KernelRegistry() {} + +size_t KernelRegistry::size() const { return impl_->data.size(); } + +KernelRegistry *GetCpuKernelRegistry() { + static auto registry = std::make_unique(); + return registry.get(); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h new file mode 100644 index 0000000000000..d65969999f6ed --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry.h @@ -0,0 +1,67 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace infrt { +namespace host_context { + +class KernelFrame; + +using KernelImplementation = void (*)(KernelFrame *frame); + +/** + * Hold the kernels registered in the system. + */ +class KernelRegistry { + public: + KernelRegistry(); + + void AddKernel(const std::string &key, KernelImplementation fn); + void AddKernelAttrNameList(const std::string &key, + const std::vector &names); + + KernelImplementation GetKernel(const std::string &key) const; + std::vector GetKernelList() const; + + size_t size() const; + + ~KernelRegistry(); + + private: + class Impl; + + std::unique_ptr impl_; +}; + +//! The global CPU kernel registry. +KernelRegistry *GetCpuKernelRegistry(); + +} // namespace host_context +} // namespace infrt + +/** + * compile function RegisterKernels in C way to avoid C++ name mangling. 
+ */ +#ifdef __cplusplus +extern "C" { +#endif +void RegisterKernels(infrt::host_context::KernelRegistry *registry); +#ifdef __cplusplus +} +#endif diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc new file mode 100644 index 0000000000000..f36ec2a1cac7d --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_registry.h" + +#include + +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt::host_context { + +int add_i32(int a, int b) { return a + b; } + +TEST(KernelRegistry, basic) { + KernelRegistry registry; + std::string key = "infrt.test.add.i32"; + registry.AddKernel(key, INFRT_KERNEL(add_i32)); + + auto* kernel_impl = registry.GetKernel(key); + ASSERT_TRUE(kernel_impl); + + ValueRef a(1); + ValueRef b(2); + KernelFrameBuilder fbuilder; + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + kernel_impl(&fbuilder); + + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results[0]->get(), 3); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/kernel_utils.cc b/paddle/infrt/host_context/kernel_utils.cc new file mode 100644 index 0000000000000..cf9476da032be --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils.h b/paddle/infrt/host_context/kernel_utils.h new file mode 100644 index 0000000000000..33812912ba029 --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils.h @@ -0,0 +1,352 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/value.h" + +namespace infrt { +namespace host_context { + +template +class Argument { + public: + explicit Argument(ValueRef value) : value_(value) {} + + ValueRef& value() { return value_; } + const ValueRef& value() const { return value_; } + + T& get() const { return value_.get(); } + + private: + ValueRef value_; +}; + +/** + * RemainingArguments collects all remaining arguments in an ArrayRef. + */ +class RemainingArguments { + public: + explicit RemainingArguments(llvm::ArrayRef remaining_arguments) + : remaining_arguments_(remaining_arguments) {} + + llvm::ArrayRef values() const { return remaining_arguments_; } + size_t size() const { return remaining_arguments_.size(); } + const Value* operator[](size_t i) const { return remaining_arguments_[i]; } + + private: + llvm::ArrayRef remaining_arguments_; +}; + +/** + * RemainingResults collects all remaining results in a MutableArrayRef. + */ +class RemainingResults { + public: + explicit RemainingResults(llvm::MutableArrayRef remaining_results) + : remaining_results_(remaining_results) {} + llvm::MutableArrayRef values() { return remaining_results_; } + size_t size() const { return remaining_results_.size(); } + + template + const ValueRef& AllocateAt(int index) { + // eagerly create a ValueRef + if (remaining_results_[index].get()) return remaining_results_[index]; + remaining_results_[index] = ValueRef(new Value); + return remaining_results_[index]; + } + ValueRef& operator[](size_t i) const { return remaining_results_[i]; } + + private: + llvm::MutableArrayRef remaining_results_; +}; + +template +class Result { + public: + explicit Result(ValueRef* result) : result_(result) {} + + template + void Emplace(Args&&... args) { + ValueRef v; + Set(T(std::forward(args)...)); + } + + void Set(Argument argument) { + CHECK(!result_->IsValid()); + *result_ = argument.value(); + } + + private: + ValueRef* result_{}; +}; + +template +class Attribute { + public: + explicit Attribute(const Value* value) : value_(value) {} + + const T& get() const { return value_->get(); } + + private: + const Value* value_; +}; + +template +class ArgumentView { + using UnderlyingT = typename ViewT::UnderlyingT; + + public: + explicit ArgumentView(Value* value) + : value_(value), arg_(&value->template get()) {} + + Value* value() const { return value_; } + ViewT& get() const { return arg_; } + ViewT* operator->() const { return &get(); } + ViewT& operator*() const { return get(); } + + private: + Value* value_{}; + mutable ViewT arg_; +}; + +template +struct KernelImpl; + +template +struct TypeTag {}; + +#define INFRT_KERNEL(...) \ + ::infrt::host_context::KernelImpl::Invoke + +template +struct KernelImpl { + static void Invoke(KernelFrame* frame) { + KernelCallHelper>::template Invoke<0, 0, 0>(frame); + } + + // Helper that introspects the arguments to derive the signature and cast + // parts of the KernelFrame to their type before passing them to impl_fn. 
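  // As an illustrative trace (a sketch, not normative): for a plain function
  // `int add_i32(int a, int b)` wrapped with INFRT_KERNEL(add_i32), Invoke()
  // above unrolls roughly as
  //   KernelCallHelper<int, int, TypeTag<int>>::Invoke<0, 0, 0>(frame)
  //     -> fetches frame->GetArgAt(0) and casts it via Value::get<int>(),
  //        then recurses with in_idx = 1;
  //     -> does the same for the second int, recursing with in_idx = 2;
  //     -> the terminal TypeTag specialization calls KernelReturnHelper, which
  //        invokes add_i32(...) and stores the int result through
  //        HandleReturn -> StoreResultAt -> frame->EmplaceResult<int>(0, ...).
  // The in_idx / out_idx / const_idx counters track how many arguments,
  // results and attributes have been consumed so far; -1 marks that a
  // RemainingArguments / RemainingResults pack has already swallowed the rest.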
+ template + struct KernelCallHelper; + + // Casts the return value of the kernel, if non-void. + // bool _ is an unnecessary parameter to make compiler allow templace specific + // in non-namespace scope. + template + struct KernelReturnHelper { + static void Invoke(KernelFrame* frame, const Args&... args) { + HandleReturn(frame, impl_fn(args...)); + } + }; + + template + struct KernelReturnHelper { + static void Invoke(KernelFrame* frame, const Args&... args) { + impl_fn(args...); + } + }; + + // Specialization to cast a single input argument(Head). + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + Argument arg(frame->GetArgAt(in_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + ArgumentView arg(frame->GetArgAt(in_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Specialization to cast a single result argument (Head). + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(out_idx != -1, + "Do not place Results after RemainingResults"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + Result arg(&frame->GetResults()[out_idx]); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Specialization to cast a single attribute. + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(const_idx != -1, + "Do not place Attributes after RemainingAttributes"); + Attribute arg(frame->GetAttributeAt(const_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Treat other pointer as an Argument. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + auto* arg = &frame->GetArgAt(in_idx); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Treat any other type as an Argument. + template + struct KernelCallHelper { + using ArgT = std::decay_t; + + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... 
pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + auto* value = frame->GetArgAt(in_idx); + auto&& arg = value->get(); + + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // RemainingArguments provides an ArrayRef containing all + // remaining arguments. Useful for variadic + // kernels. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not use more than one RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results."); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + RemainingArguments remaining_arguments( + frame->GetArguments().drop_front(in_idx)); + + KernelCallHelper::template Invoke<-1, out_idx, const_idx>( + frame, pargs..., remaining_arguments); + } + }; + + // RemainingResults provides an MutableArrayRef containing all + // remaining results. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(out_idx != -1, "Do not use more than one RemainingResults"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + llvm::MutableArrayRef returned_results = + frame->GetResults().drop_front(out_idx); + + llvm::SmallVector result_values; + for (size_t i = 0; i < returned_results.size(); i++) + result_values.emplace_back(returned_results[i]); + + RemainingResults remaining_results(result_values); + KernelCallHelper::template Invoke( + frame, pargs..., remaining_results); + } + }; + + // No arguments left. + template + struct KernelCallHelper> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + KernelReturnHelper::Invoke(frame, pargs...); + } + }; + + // Handle pair result + template + static void HandleReturn(KernelFrame* frame, std::pair&& t) { + CHECK_EQ(frame->GetNumResults(), 2); + StoreResultAt(frame, 0, std::move(t.first)); + StoreResultAt(frame, 1, std::move(t.second)); + } + + // Store the function result back to the output Value in KernelFrame. + template + static void HandleReturn(KernelFrame* frame, T&& t) { + assert(frame->GetNumResults() == 1 && "Extra results passed to kernel."); + StoreResultAt(frame, 0, std::forward(t)); + } + + // Store result as an Value output in KernelFrame. + template + static void StoreResultAt(KernelFrame* frame, int index, T&& t) { + frame->EmplaceResult>(index, std::forward(t)); + } +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc new file mode 100644 index 0000000000000..1904eb106a293 --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_utils.h" + +#include + +namespace infrt::host_context { + +int add_i32(int a, int b) { return a + b; } +float add_f32(float a, float b) { return a + b; } +std::pair add_pair(int a, float b) { return {a, b}; } + +TEST(KernelImpl, i32) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1)); + ValueRef b(new Value(2)); + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + INFRT_KERNEL(add_i32)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3); +} + +TEST(KernelImpl, f32) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1.f)); + ValueRef b(new Value(2.f)); + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + INFRT_KERNEL(add_f32)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3.f); +} + +TEST(KernelImpl, pair) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1)); + ValueRef b(new Value(3.f)); + + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(2); + + INFRT_KERNEL(add_pair)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 2UL); + ASSERT_EQ(results[0]->get(), 1); + ASSERT_EQ(results[1]->get(), 3.f); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc new file mode 100644 index 0000000000000..b0d70af5ef9f2 --- /dev/null +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
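Beyond the plain-value kernels exercised by the tests above, a kernel can take the wrapper types from kernel_utils.h; the static_asserts in KernelCallHelper require all arguments first, then results, then attributes. A small sketch under that contract; the Scale kernel and the "demo.scale.f32" key are made up for illustration.

#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"

namespace demo {

// Multiplies the input by an attribute-carried factor; the float return value
// is written into the frame's single result slot via HandleReturn.
float Scale(float x, infrt::host_context::Attribute<float> factor) {
  return x * factor.get();
}

// Illustrative registration helper.
void RegisterDemoKernels(infrt::host_context::KernelRegistry* registry) {
  registry->AddKernel("demo.scale.f32", INFRT_KERNEL(Scale));
}

}  // namespace demo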
+ +#include + +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +int main(int argc, char** argv) { + using namespace llvm; // NOLINT + using namespace infrt; // NOLINT + cl::opt input_file("i", + cl::desc("Specify input filename"), + cl::value_desc("input file name")); + cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = dialect::LoadMlirFile(input_file.c_str(), context); + + host_context::KernelRegistry registry; + + kernel::RegisterBasicKernels(®istry); + kernel::RegisterTestKernels(®istry); + kernel::RegisterTensorShapeKernels(®istry); + kernel::RegisterTensorKernels(®istry); + kernel::RegisterControlFlowKernels(®istry); + + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + return 1; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast(reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + + host_context::TestMlir(module.get(), ®istry); + + std::cout << std::endl; + return 0; +} diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc new file mode 100644 index 0000000000000..5f8dacf8e448a --- /dev/null +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
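The executor above resolves a RegisterKernels symbol in every library passed through its shared_libs option, so extra kernels can be added without rebuilding the tool. A minimal sketch of such a library, assuming only the registry and kernel-utils headers from this patch; the file name, the kernel and its key are invented.

// my_kernels.cc -- built as a shared object and loaded through shared_libs.
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"

float sub_f32(float a, float b) { return a - b; }

// Matches the extern "C" hook declared in kernel_registry.h; the executor
// finds it with DynamicLibrary::SearchForAddressOfSymbol("RegisterKernels").
extern "C" void RegisterKernels(
    infrt::host_context::KernelRegistry* registry) {
  registry->AddKernel("external.sub.f32", INFRT_KERNEL(sub_f32));
}

The resulting shared object would then be supplied next to the -i <program.mlir> argument on the tool's command line.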
+ +#include "paddle/infrt/host_context/mlir_function_executable.h" + +#include + +#include // NOLINT + +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/host_context/core_runtime.h" + +namespace infrt { +namespace host_context { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +MlirFunctionExecutable::MlirFunctionExecutable( + mlir::FuncOp func_op, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table) + : Function(func_op.getName().str(), + func_op.getNumArguments(), + func_op.getNumResults()), + MlirToRuntimeTranslator(&core_runtime_builder_), + region_(&func_op.getRegion()), + core_runtime_builder_(kernel_registry), + function_table_(function_table) {} + +MlirFunctionExecutable::MlirFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table) + : Function("", func_type.getNumInputs(), func_type.getNumResults()), + MlirToRuntimeTranslator(&core_runtime_builder_), + region_(region), + core_runtime_builder_(kernel_registry), + function_table_(function_table) {} + +void MlirFunctionExecutable::BuildExecutables( + llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region) { + CHECK_EQ(arguments.size(), num_arguments()); + // We use the function call's arguments as op_executable's operands to avoid + // copy. + for (size_t i = 0; i < num_arguments(); i++) { + AddValue(region_->getArgument(i), arguments[i]); + } + + // build the program + auto& blocks = region_->getBlocks(); + CHECK_EQ(blocks.size(), 1UL) + << "function with more than one block is not supported yet"; + + llvm::SmallVector runtime_results; + for (auto& op : blocks.front()) { + if (EmitConstantOp(&op)) continue; + if (EmitBuildShapeOp(&op)) continue; + + llvm::SmallVector mlir_results; + if (EmitReturnOp(&op, &mlir_results)) { + if (!is_region) { + for (auto v : mlir_results) { + runtime_results.push_back(GetValue(v)); + } + } + continue; + } + + if (EmitCallOp(&op, &function_table_)) continue; + + if (EmitGeneralOp(&op)) continue; + LOG(FATAL) << "Not supported op: " << DumpToString(op); + } + + // after the block is built, we can get the result values of the whole + // function call in the runtime_results. + + mlir::SmallVector results_copied; + if (!is_region) { + for (ValueRef& x : results) { + results_copied.push_back(x.get()); + } + } + + // set a lambda function to help copy the results from the runtime results in + // the local function to outer program. + CHECK_EQ(results_copied.size(), runtime_results.size()); + this->copy_res_fn_ = [results_copied, runtime_results] { + VLOG(4) << "copy results to result"; + for (size_t i = 0; i < results_copied.size(); i++) { + VLOG(4) << ".. 
copy " << runtime_results[i] << " to " + << results_copied[i]; + CopyTo(*runtime_results[i], results_copied[i]); + } + }; +} + +void MlirFunctionExecutable::Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region) const { + CHECK_EQ(arguments.size(), num_arguments()); + CHECK_EQ(results.size(), num_results()); + + if (core_runtime_builder_.num_ops() == 0) { + Reference(this).BuildExecutables(arguments, results, is_region); + } + + Reference(&core_runtime_builder_).Execute(); + + copy_res_fn_(); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h new file mode 100644 index 0000000000000..ba5fa154d6fcc --- /dev/null +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +/** + * Executable function for a given MLIR function definition, mainly used in two + * scenerios: + * 1. infrt.call op + * 2. main function call + * + * A MlirFunctionExecutable might have one or more arguments and results. + */ +class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator { + public: + using function_defs_t = std::unordered_map; + + MlirFunctionExecutable(mlir::FuncOp func_op, + KernelRegistry* kernel_registry, + function_defs_t& function_table); // NOLINT + + MlirFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table); // NOLINT + + /** + * Execute the function with the given arguments and results. + * NOTE the \param arguments and \param results should not be altered. + */ + void Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region = false) const; + + private: + /** + * Build the runtime executables once the function call arguments and results + * are passed in. + * This will trigger in the first execution. + */ + void BuildExecutables(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region); + + private: + mlir::Region* region_{}; + CoreRuntimeBuilder core_runtime_builder_; + MlirToRuntimeTranslator::function_defs_t& function_table_; + std::function copy_res_fn_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_program_executor.cc b/paddle/infrt/host_context/mlir_program_executor.cc new file mode 100644 index 0000000000000..c5009bcc97c5c --- /dev/null +++ b/paddle/infrt/host_context/mlir_program_executor.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/mlir_program_executor.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h new file mode 100644 index 0000000000000..b2af4d2d79db5 --- /dev/null +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/host_context/op_executable.h" + +namespace infrt { +namespace host_context { + +/** + * This get a MLIR program as input, it compiles it into runtime program, and + * one can retrieve the function and execute + * it by passing the input arguments. + */ +class MlirProgramExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder runtime_builder; + mlir::ModuleOp module; + function_defs_t function_defs; + + MlirProgramExecutor(mlir::ModuleOp module, KernelRegistry* registry) + : MlirToRuntimeTranslator(module, &runtime_builder), + runtime_builder(registry), + module(module) {} + + // Build functions and generate executables. 
+ void BuildFunctions() { EmitFunctions(); } + + void EmitFunction(mlir::FuncOp op) override { + LOG(INFO) << "Emit function: " << op.getName().str(); + function_defs[op.getName().str()] = op; + + func_executables_.emplace( + op.getName().str(), + new MlirFunctionExecutable( + op, runtime_builder.kernel_registry(), function_defs)); + } + + MlirFunctionExecutable* LookupFunc(const std::string& name) { + auto it = func_executables_.find(name); + if (it != func_executables_.end()) { + return it->second.get(); + } + return nullptr; + } + + private: + std::unordered_map> + func_executables_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir new file mode 100644 index 0000000000000..263d5884134b1 --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -0,0 +1,30 @@ +// CHECK-LABEL: basic +func @basic() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK: 1 + "infrt.print.f32"(%v0) : (f32) -> () + // CHECK: 2 + "infrt.print.f32"(%v1) : (f32) -> () + + // CHECK: 3 + "infrt.print.f32"(%v2) : (f32) -> () + + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + // CHECK: 6 + "infrt.print.f32"(%v3) : (f32) -> () + + infrt.return %v3 : f32 +} + +// CHECK-LABEL: basic1 +// Check the mlir executor can work with more than one function in a file. +func @basic1() -> () { + %v0 = infrt.constant.f32 1.0 + "infrt.print.f32"(%v0) : (f32) -> () + // CHECK: 1 + infrt.return +} \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir new file mode 100644 index 0000000000000..83afa1db8a91c --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -0,0 +1,9 @@ +// CHECK-LABEL: build_tensor1 +func @build_tensor1() { + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%a : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir new file mode 100644 index 0000000000000..a3130857b0ef7 --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -0,0 +1,7 @@ +// CHECK-LABEL: build_tensor1 +func @build_tensor1() { + %a = ts.build_shape [1:i64, 57:i64, 92:i64] + // CHECK: shape[1,57,92] + ts.print_shape %a + infrt.return +} \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc new file mode 100644 index 0000000000000..25324b1291582 --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "boost/optional.hpp" +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/value.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::host_context { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +struct MlirToRuntimeTranslator::Impl { + mlir::ModuleOp module; + // The runtime for a function call. + CoreRuntimeBuilder* runtime{}; + // The current working op, the translator process the ops one by one, each + // time it updates `cur_op` here to current op + // working on. + OpExecutableBuilder* cur_op{}; + + // record the current function name. + std::string cur_func_name; + + // Name to function definitions. + std::unordered_map func_defs; + + // Map from an operation to its results. + std::unordered_map> op_results; + llvm::DenseMap value_map; +}; + +bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { + if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) + return false; + VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() + << "]"; + + auto attr = op->getAttr("value"); + if (attr.isa()) { + if (attr.getType().isF32()) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getValueAsDouble()))}; + } else if (attr.getType().isF64()) { + impl_->op_results[op] = {ValueRef(static_cast( + attr.cast().getValueAsDouble()))}; + } else { + LOG(FATAL) << "Not supported attribute type"; + } + return true; + } + + if (attr.isa()) { + if (attr.getType().isInteger(32)) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getSInt()))}; + } else if (attr.getType().isInteger(64)) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getSInt()))}; + } else if (attr.getType().isInteger(1)) { + impl_->op_results[op] = { + ValueRef(static_cast(attr.cast().getInt()))}; + } else { + LOG(FATAL) << "Not supported attribute type"; + } + return true; + } + + LOG(FATAL) << "Not supported constant attribute type"; + return true; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isInteger(32)) { + return val.getInt(); + } + } + return boost::none; +} +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isInteger(64)) { + return val.getInt(); + } + } + return boost::none; +} + +// TODO(Superjomn) Make double and float parsing share some thing. 
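// As a concrete example of the dispatch contract used below: for an op
// attribute `value = 2.0 : f32`, EmitAttribute<int32_t>(&attr) yields
// boost::none while EmitAttribute<float>(&attr) yields 2.0f, so EmitGeneralOp
// can simply probe the supported types one after another and wrap the first
// hit in a host_context::Value.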
+template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isF32()) return val.getValueAsDouble(); + } + return boost::none; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isF64()) return val.getValueAsDouble(); + } + return boost::none; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + return attr->cast().getValue().str(); +} + +#define PROCESS_ARRAY_INT(type__, bits__) \ + template <> \ + boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ + const mlir::Attribute* attr) { \ + if (!attr->isa()) return boost::none; \ + auto array = attr->cast(); \ + CHECK(!array.empty()); \ + \ + if (!array[0].getType().isInteger(bits__)) { \ + return boost::none; \ + } \ + \ + std::vector res; \ + for (auto& v : array) { \ + res.push_back(v.cast().getInt()); \ + } \ + return res; \ + } + +PROCESS_ARRAY_INT(int16_t, 16); +PROCESS_ARRAY_INT(int32_t, 32); +PROCESS_ARRAY_INT(int64_t, 64); + +template <> +boost::optional> MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + auto array = attr->cast(); + CHECK(!array.empty()); + + if (!array[0].getType().isF32()) return boost::none; + + std::vector res; + for (auto& v : array) { + res.push_back(v.cast().getValueAsDouble()); + } + return res; +} + +template <> +boost::optional> MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + auto array = attr->cast(); + CHECK(!array.empty()); + + if (!array[0].getType().isF64()) return boost::none; + + std::vector res; + for (auto& v : array) { + res.push_back(v.cast().getValueAsDouble()); + } + return res; +} + +static bool IsReturn(mlir::Operation* op) { + return op->getName().getStringRef() == "infrt.return"; +} + +bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { + CHECK(impl_->runtime); + impl_->cur_op = + impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); + + VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } + + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); + impl_->cur_op->AppendArgument(arg_value); + + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; + } + + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + 
impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + + // process attributes + auto attrs = op->getAttrs(); + + for (size_t i = 0; i < attrs.size(); i++) { + auto& attr = attrs[i]; + if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else { + LOG(FATAL) << "Not supported attribute type"; + } + } + + // process regions, we treat regions as attribute. + auto num_regions = op->getNumRegions(); + if (num_regions > 0) { + CHECK_EQ(num_regions, 1UL) + << "op with more than one region is not supported yet."; + auto& region = op->getRegions().front(); + auto num_blocks = region.getBlocks().size(); + CHECK_EQ(num_blocks, 1UL) + << "region with more than one block is not supported yet."; + + // process arguments + llvm::SmallVector inputs; + auto& block = region.getBlocks().front(); + for (auto arg : block.getArguments()) inputs.push_back(arg.getType()); + + // process results + // NOTE: if an op contains a region, we simply ignore the region's return + // values, + // or its return values will conflict with op's return values. + llvm::SmallVector results; + + auto func_type = + mlir::FunctionType::get(inputs, results, region.getContext()); + auto* function = impl_->cur_op->CreateFunctionExecutable( + ®ion, func_type, &impl_->func_defs); + impl_->cur_op->AppendAttribute(new Value(function)); + } + + return true; +} + +bool MlirToRuntimeTranslator::EmitReturnOp( + mlir::Operation* op, llvm::SmallVectorImpl* results) { + CHECK(results); + if (op->getName().getStringRef() == "infrt.return") { + for (size_t i = 0; i < op->getNumOperands(); i++) { + results->push_back(op->getOperand(i)); + } + + return true; + } + return false; +} + +bool MlirToRuntimeTranslator::EmitFunctions() { + for (auto func_op : impl_->module.getOps()) { + EmitFunction(func_op); + } + return true; +} + +void MlirToRuntimeTranslator::EmitFunction(mlir::FuncOp op) { + impl_->func_defs[op.getName().str()] = op; +} + +Value* MlirToRuntimeTranslator::GetOpResult(mlir::Operation* op) { + auto it = impl_->op_results.find(op); + return it == impl_->op_results.end() ? nullptr : it->second.front().get(); +} + +Value* MlirToRuntimeTranslator::GetValue(mlir::Value value) { + auto it = impl_->value_map.find(value); + return it == impl_->value_map.end() ? 
nullptr : it->second.get(); +} + +Value* MlirToRuntimeTranslator::AddValue(mlir::Value value) { + auto res = impl_->value_map.try_emplace(value, ValueRef(new Value)); + CHECK(res.second) << "Duplicate add mlir value [" << DumpToString(value) + << "]"; + return res.first->second.get(); +} + +MlirToRuntimeTranslator::~MlirToRuntimeTranslator() {} + +void MlirToRuntimeTranslator::UpdateCurFuncName(const std::string& name) { + impl_->cur_func_name = std::string(name); +} + +MlirToRuntimeTranslator::MlirToRuntimeTranslator(mlir::ModuleOp module, + CoreRuntimeBuilder* runtime) + : impl_(new Impl) { + CHECK(runtime); + impl_->module = module; + impl_->runtime = runtime; +} + +bool MlirToRuntimeTranslator::EmitBuildShapeOp(mlir::Operation* op) { + if (op->getName().getStringRef() != "ts.build_shape") return false; + + auto value = op->getAttr("value"); + + CHECK(value.isa()); + auto values = value.cast().getValue(); + std::vector dims; + for (auto& attr_v : values) { + dims.push_back(attr_v.cast().getInt()); + } + impl_->op_results[op] = { + ValueRef(new Value(tensor::TensorShape(llvm::ArrayRef(dims))))}; + + return true; +} + +bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, + function_defs_t* function_table) { + CHECK(op); + CHECK(function_table); + if (op->getName().getStringRef() != "infrt.call") return false; + + impl_->cur_op = + impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); + + auto callee = op->getAttr("callee"); + auto callee_name = callee.dyn_cast(); + + // process arguments + for (size_t i = 0; i < op->getNumOperands(); i++) { + auto operand = op->getOperand(i); + auto* arg_value = GetValue(operand); + + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); + impl_->cur_op->AppendArgument(arg_value); + } + + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + } + impl_->cur_op->SetResults(res_values); + + // process attribute + auto& table = function_table ? *function_table : impl_->func_defs; + { + // lookup the callee function + auto it = table.find(callee_name.getValue().str()); + CHECK(it != table.end()) << "can't find function [" + << callee_name.getValue().str() << "]"; + auto* function = + impl_->cur_op->CreateFunctionExecutable(it->second, &impl_->func_defs); + impl_->cur_op->AppendAttribute(new Value(function)); + } + + VLOG(3) << "Emit call " << callee_name.getValue().str() << " " + << impl_->cur_op->frame(); + return true; +} + +MlirToRuntimeTranslator::MlirToRuntimeTranslator(CoreRuntimeBuilder* runtime) + : impl_(new Impl) { + CHECK(runtime); + impl_->runtime = runtime; +} + +Value* MlirToRuntimeTranslator::AddValue(mlir::Value mlir_value, Value* value) { + auto it = impl_->value_map.try_emplace(mlir_value, ValueRef(value)); + CHECK(it.second) << "duplicate add value " << DumpToString(mlir_value); + return value; +} + +void MlirToRuntimeTranslate(mlir::ModuleOp module, + CoreRuntimeBuilder* runtime) { + MlirToRuntimeTranslator(module, runtime).Run(); +} + +/** + * Execute the mlir program in test mode -- print some debug information to + * stdout. 
+ */ +class MlirProgramTestExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder core_runtime; + + MlirProgramTestExecutor(mlir::ModuleOp module, KernelRegistry* registry) + : MlirToRuntimeTranslator(module, &core_runtime), + core_runtime(registry), + registry(registry) { + CHECK(registry); + } + + void Run() { + EmitFunctions(); + + CHECK(registry); + for (auto func_op : impl_->module.getOps()) { + VLOG(3) << "Running function " << func_op.getName().str(); + EmitAndRunFuncWithoutArguments(func_op); + } + } + + protected: + std::unordered_map func_def_table; + + void EmitFunction(mlir::FuncOp op) override { + CHECK(!impl_->func_defs.count(op.getName().str())) + << "Duplicate function defition found for function [" + << op.getName().str(); + impl_->func_defs.emplace(op.getName().str(), op); + } + + private: + void EmitAndRunFuncWithoutArguments(mlir::FuncOp func) { + // print the function name for llvm FileChecker macro, CHECK-LABEL + std::cout << '@' << func.getName().str() << std::endl; + if (func.getNumArguments() == + 0) { // an entry function, execute it immediately + VLOG(3) << "executing function " << func.getName().str(); + // Emit and execute each function + CoreRuntimeBuilder runtime(registry); + impl_->runtime = &runtime; + + auto& blocks = func.getBlocks(); + CHECK_EQ(blocks.size(), 1UL) + << "function with more than one block is not supported yet"; + + for (auto& op : blocks.front()) { + if (EmitConstantOp(&op)) continue; + if (EmitBuildShapeOp(&op)) continue; + llvm::SmallVector results; + if (EmitReturnOp(&op, &results)) continue; + if (EmitCallOp(&op, &impl_->func_defs)) continue; + if (EmitGeneralOp(&op)) continue; + LOG(FATAL) << "Not supported op: " << DumpToString(op); + } + + runtime.Execute(); + + } else { + VLOG(2) << "get an callable function: " << func.getName().str(); + } + } + + private: + KernelRegistry* registry{}; +}; + +void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { + MlirProgramTestExecutor execute(module, registry); + execute.Run(); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h new file mode 100644 index 0000000000000..598e81bfd96d8 --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -0,0 +1,107 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include // NOLINT +#include //NOLINT +#include // NOLINT + +namespace mlir { +class FuncOp; +class ModuleOp; +class Operation; +class Attribute; +class Value; +} // namespace mlir + +namespace infrt::host_context { + +class CoreRuntimeBuilder; +class Value; +class ValueRef; +class KernelRegistry; + +/** + * MlirToRuntimeTranslator helps to translate a MLIR program to a CoreRuntime. + * This is the base class of all the modules those parse a MLIR program and + * finally generate a CoreRuntime. 
+ */ +class MlirToRuntimeTranslator { + public: + //! Holds all the function definitions. + using function_defs_t = std::unordered_map; + + explicit MlirToRuntimeTranslator(CoreRuntimeBuilder* runtime); + MlirToRuntimeTranslator(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); + + void Run() { EmitFunctions(); } + + virtual ~MlirToRuntimeTranslator(); + + protected: + //! Emit a "infrt.constant.*" operation, return true if succeed. + bool EmitConstantOp(mlir::Operation* op); + //! Emit a "infrt.return" operation. + bool EmitReturnOp(mlir::Operation* op, + llvm::SmallVectorImpl* results); + //! Emit a "ts.build_shape" operation. + bool EmitBuildShapeOp(mlir::Operation* op); + //! Emit an operation other than the special cases above. + bool EmitGeneralOp(mlir::Operation* op); + //! Emit all the functions. + bool EmitFunctions(); + + //! Emit a single function, this is an API that should be implemented by + //! inherients. + virtual void EmitFunction(mlir::FuncOp op); + + bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); + + template + boost::optional EmitAttribute(const mlir::Attribute* attr); + + Value* GetOpResult(mlir::Operation* op); + + Value* GetValue(mlir::Value value); + + Value* AddValue(mlir::Value value); + + Value* AddValue(mlir::Value mlir_value, Value* value); + + void UpdateCurFuncName(const std::string& name); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +/** + * Build a CoreRuntime from a MLIR module. + */ +void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); + +/** + * Execute a MLIR program, that is execute all the functions without input + * arguments. + * This is mainly used by testcase. + * @param module a MLIR module. + * @param registry the kernel registry containing all the valid kernels. + */ +void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc new file mode 100644 index 0000000000000..9b85be977ab6c --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
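The translator header above underpins both TestMlir and the MlirProgramExecutor shown earlier; the test file that follows drives MlirProgramExecutor at scale. In miniature, and assuming a module whose @predict function matches the argument and result counts supplied by the caller, the call path looks like this sketch.

#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/mlir_program_executor.h"

void CallMlirFunc(mlir::ModuleOp module,
                  infrt::host_context::KernelRegistry* registry) {
  using namespace infrt::host_context;  // NOLINT

  MlirProgramExecutor executor(module, registry);
  executor.BuildFunctions();  // one MlirFunctionExecutable per func op

  MlirFunctionExecutable* fn = executor.LookupFunc("predict");
  if (!fn) return;

  // The sizes of both vectors must match the MLIR function's signature.
  std::vector<Value*> in_args;                          // caller-owned inputs
  std::vector<ValueRef> out_args{ValueRef(new Value)};  // result slots

  fn->Execute(
      llvm::ArrayRef<Value*>(in_args.data(), in_args.size()),
      llvm::MutableArrayRef<ValueRef>(out_args.data(), out_args.size()));
}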
+ +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/mlir_program_executor.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +namespace infrt::host_context { + +TEST(MlirToRuntimeTranslate, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> () { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v1) : (f32) -> () + + infrt.return +} +)ROC"; + + auto module = dialect::LoadMlirSource(&context, source); + module->verify(); + + KernelRegistry registry; + kernel::RegisterFloatBasicKernels(®istry); + kernel::RegisterIntBasicKernels(®istry); + + TestMlir(module.get(), ®istry); +} + +TEST(TestMlir, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> () { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v1) : (f32) -> () + + infrt.return +} +)ROC"; + + auto module = dialect::LoadMlirSource(&context, source); + module->verify(); + + KernelRegistry registry; + kernel::RegisterFloatBasicKernels(®istry); + kernel::RegisterIntBasicKernels(®istry); + + TestMlir(module.get(), ®istry); +} + +TEST(TestMlir, shadow_copy_tensor_profile) { + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + + auto head = R"ROC( +func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { +)ROC"; + + auto tpl0 = + "%a{0} = dt.shallow_copy_tensor %a : !infrt.tensor -> " + "!infrt.tensor"; + auto tpl1 = + "%b{0} = dt.shallow_copy_tensor %b : !infrt.tensor -> " + "!infrt.tensor"; + + auto end = R"ROC( +infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +} + )ROC"; + + std::stringstream ss; + ss << head; + for (int i = 0; i < 2000; i++) { + ss << llvm::formatv(tpl0, i).str() << "\n"; + ss << llvm::formatv(tpl1, i).str() << "\n"; + } + ss << end; + + auto content = ss.str(); + + // LOG(INFO) << "content: " << content << std::endl; + + auto module = dialect::LoadMlirSource(context, content); + module->verify(); + + host_context::KernelRegistry registry; + + kernel::RegisterBasicKernels(®istry); + kernel::RegisterTestKernels(®istry); + kernel::RegisterTensorShapeKernels(®istry); + kernel::RegisterTensorKernels(®istry); + kernel::RegisterControlFlowKernels(®istry); + + MlirProgramExecutor executor(*module, ®istry); + executor.BuildFunctions(); + + auto* func = executor.LookupFunc("predict"); + ASSERT_TRUE(func); + + std::vector in_args; + std::vector out_args( + {ValueRef(new Value(tensor::DenseHostTensor())), + ValueRef(new Value(tensor::DenseHostTensor()))}); + + auto create_tensor = [] { + tensor::DenseHostTensor a(tensor::TensorShape{{200, 3000}}, + DType(DType::Kind::F32)); + auto* data = reinterpret_cast(a.raw_data()); + for (int i = 0; i < a.shape().GetNumElements(); i++) { + data[i] = i; + } + return a; + }; + + std::vector 
inputs({ValueRef(new Value(create_tensor())), + ValueRef(new Value(create_tensor()))}); + in_args.assign({inputs[0].get(), inputs[1].get()}); + + for (int i = 0; i < 500; i++) { + func->Execute( + llvm::ArrayRef(in_args.data(), in_args.size()), + llvm::MutableArrayRef(out_args.data(), out_args.size())); + } +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc new file mode 100644 index 0000000000000..6b10ed473719e --- /dev/null +++ b/paddle/infrt/host_context/op_executable.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/op_executable.h" + +#include + +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt::host_context { + +struct OpExecutable::Impl { + Impl(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry) + : name(op_name), + symbol_table(symbol_table), + kernel_registry(kernel_registry ? kernel_registry + : GetCpuKernelRegistry()) { + CHECK(kernel_registry); + } + + inline bool to_execute() const { + return !run_once || run_once && !has_executed; + } + inline void MarkRun() { has_executed = true; } + + std::string name; + SymbolTable* symbol_table{}; + KernelFrameBuilder frame; + KernelRegistry* kernel_registry{}; + + std::unique_ptr mlir_function_executable; + + KernelImplementation kernel_impl{}; + + //! Tell whether this Op should be executed only once. + bool run_once{}; + //! Tell whether this op has been executed. + bool has_executed{}; +}; + +OpExecutable::OpExecutable(OpExecutable::Impl* impl) : impl_(impl) {} + +const std::string& OpExecutable::name() const { return impl_->name; } + +OpExecutableBuilder::OpExecutableBuilder(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry) + : OpExecutable(new Impl(op_name, symbol_table, kernel_registry)) { + CHECK(impl_); + // CPU kernel registry is the default KernelRegistry. + impl_->kernel_impl = impl_->kernel_registry->GetKernel( + std::string(op_name.data(), op_name.size())); + // TODO(Superjomn) support other device other than CPU. 
+ CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name; + + if (op_name == "dt.get_param") { + impl_->run_once = true; + } +} + +void OpExecutableBuilder::AppendArgument(const std::string& name) { + if (!impl_->symbol_table->GetValue(name)) { + impl_->symbol_table->Register(name); + } + impl_->frame.AddArgument(impl_->symbol_table->GetValue(name)); +} + +void OpExecutableBuilder::AppendArgument(Value* value) { + impl_->frame.AddArgument(value); +} + +KernelFrame& OpExecutable::frame() { return impl_->frame; } +const KernelFrame& OpExecutable::frame() const { return impl_->frame; } + +void OpExecutableBuilder::SetResults(llvm::ArrayRef result_names) { + llvm::SmallVector results; + for (size_t result_id = 0; result_id < result_names.size(); result_id++) { + Value* value = impl_->symbol_table->Register(result_names[result_id]); + results.push_back(value); + } + impl_->frame.SetResults(results); +} + +void OpExecutableBuilder::SetResults(llvm::ArrayRef results) { + impl_->frame.SetResults(results); +} + +void OpExecutableBuilder::AppendAttribute(Value* value) { + impl_->frame.AddAttribute(value); +} + +OpExecutableBuilder::OpExecutableBuilder(OpExecutableBuilder&& other) + : OpExecutable(other.impl_.release()) {} + +MlirFunctionExecutable* OpExecutableBuilder::CreateFunctionExecutable( + mlir::FuncOp op, MlirToRuntimeTranslator::function_defs_t* function_defs) { + CHECK(!impl_->mlir_function_executable); + impl_->mlir_function_executable.reset( + new MlirFunctionExecutable(op, impl_->kernel_registry, *function_defs)); + return impl_->mlir_function_executable.get(); +} + +MlirFunctionExecutable* OpExecutableBuilder::CreateFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + function_defs_t* function_defs) { + CHECK(!impl_->mlir_function_executable); + impl_->mlir_function_executable.reset(new MlirFunctionExecutable( + region, func_type, impl_->kernel_registry, *function_defs)); + return impl_->mlir_function_executable.get(); +} + +void OpExecutable::Execute() { +#ifndef NDEBUG + VLOG(3) << "execute " << name() + << " --- frame args: " << impl_->frame.GetNumArgs() << " results " + << impl_->frame.GetNumResults() << " attributes " + << impl_->frame.GetNumAttributes(); + for (int i = 0; i < impl_->frame.GetNumArgs(); i++) { + VLOG(3) << "function arg: " << impl_->frame.GetArgAt(i); + } + for (int i = 0; i < impl_->frame.GetNumResults(); i++) { + VLOG(3) << "function result: " << impl_->frame.GetResults()[i]; + } +#endif + + if (impl_->to_execute()) { + impl_->kernel_impl(&impl_->frame); + impl_->MarkRun(); + } +} + +OpExecutable::~OpExecutable() {} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h new file mode 100644 index 0000000000000..e2248225a5caf --- /dev/null +++ b/paddle/infrt/host_context/op_executable.h @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
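Editor's note: OpExecutableBuilder::AppendAttribute above routes attribute Values into the KernelFrame separately from ordinary arguments; on the kernel side such values surface as host_context::Attribute<T> parameters, which is the convention the tensor and test kernels later in this patch follow. A minimal sketch of an attributed kernel under a made-up op name (demo.add_n.i32 is illustrative, not part of this patch):

#include <cstdint>

#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"

namespace demo {

// "x" arrives as a frame argument, "n" as a frame attribute added via AppendAttribute.
static int32_t AddN(int32_t x, infrt::host_context::Attribute<int32_t> n) {
  return x + n.get();
}

void RegisterDemoKernels(infrt::host_context::KernelRegistry* registry) {
  registry->AddKernel("demo.add_n.i32", INFRT_KERNEL(AddN));
}

}  // namespace demo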
+ +#pragma once +#include + +#include +#include +#include + +#include "mlir/IR/Function.h" +#include "mlir/IR/Region.h" + +namespace mlir { +class FuncOp; +} // namespace mlir + +namespace infrt::host_context { + +class SymbolTable; +class KernelRegistry; +class KernelFrame; +class Value; +class CoreRuntimeBuilder; +class MlirFunctionExecutable; + +/** + * OpExecutable is a runtime executable instance for an operation. It captures + * all the information(Tensors, attributes + * and so on) needed for execution. + * With the SymbolTable and op definition, it create and hold a KernelFrame once + * and execute any times. + */ +class OpExecutable { + public: + KernelFrame& frame(); + const KernelFrame& frame() const; + + void Execute(); + + const std::string& name() const; + + ~OpExecutable(); + + protected: + class Impl; + explicit OpExecutable(Impl* impl); + + std::unique_ptr impl_; +}; + +/** + * Builder to help contruct an OpExecutable. + */ +class OpExecutableBuilder : public OpExecutable { + public: + using function_defs_t = std::unordered_map; + + OpExecutableBuilder(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry = nullptr); + OpExecutableBuilder(OpExecutableBuilder&& other); + + void AppendArgument(const std::string& name); + void AppendArgument(Value* value); + + void SetResults(llvm::ArrayRef result_names); + void SetResults(llvm::ArrayRef results); + + void AppendAttribute(Value* value); + + MlirFunctionExecutable* CreateFunctionExecutable( + mlir::FuncOp op, function_defs_t* function_defs); + + MlirFunctionExecutable* CreateFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + function_defs_t* function_defs); +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable_test.cc b/paddle/infrt/host_context/op_executable_test.cc new file mode 100644 index 0000000000000..f981cca4426c1 --- /dev/null +++ b/paddle/infrt/host_context/op_executable_test.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/op_executable.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt { +namespace host_context { + +int add(int a, int b) { return a + b; } + +TEST(OpExecutable, basic) { + // register kernel + KernelRegistry registry; + registry.AddKernel("infrt.test.add.i32", INFRT_KERNEL(add)); + + SymbolTable table; + table.Register("a", 1); + table.Register("b", 2); + + OpExecutableBuilder executable("infrt.test.add.i32", &table, ®istry); + executable.AppendArgument("a"); + executable.AppendArgument("b"); + executable.SetResults({"c"}); + + executable.Execute(); + + // check the kernel frame has the result. 
+ auto results = executable.frame().GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3); + + // check symbol table contains the same result instance. + LOG(INFO) << "type: " << table.GetValue("c")->type_info(); + int c = table.GetValue("c")->get(); + ASSERT_EQ(c, 3); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/symbol_table.cc b/paddle/infrt/host_context/symbol_table.cc new file mode 100644 index 0000000000000..318dc0cc55624 --- /dev/null +++ b/paddle/infrt/host_context/symbol_table.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/symbol_table.h" + +#include + +namespace infrt { +namespace host_context { + +struct SymbolTable::Impl { + std::unordered_map data; +}; + +SymbolTable::SymbolTable() : impl_(new Impl) {} + +Value* SymbolTable::Register(const std::string& key) { + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; + auto newitem = ValueRef(new Value); + impl_->data.emplace(key, newitem); + return newitem.get(); +} + +Value* SymbolTable::Register(const std::string& key, ValueRef value) { + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; + impl_->data.emplace(key, value); + return value.get(); +} + +Value* SymbolTable::GetValue(const std::string& key) const { + auto it = impl_->data.find(std::string(key)); + return it != impl_->data.end() ? it->second.get() : nullptr; +} + +// @{ +#define REGISTER_TYPE__(T) \ + template <> \ + T SymbolTable::Get(const std::string& key) { \ + auto it = impl_->data.find(std::string(key)); \ + CHECK(it != impl_->data.end()) << "No value called " << key; \ + return it->second->get(); \ + } +REGISTER_TYPE__(int32_t); +REGISTER_TYPE__(float); +REGISTER_TYPE__(double); +REGISTER_TYPE__(int64_t); +#undef REGISTER_TYPE__ +// @} + +SymbolTable::~SymbolTable() {} + +size_t SymbolTable::size() const { return impl_->data.size(); } + +// @{ +#define REGISTER_TYPE__(T) \ + template <> \ + Value* SymbolTable::Register(const std::string& key, T&& v) { \ + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; \ + auto newitem = ValueRef(v); \ + impl_->data.emplace(key, newitem); \ + return newitem.get(); \ + } +REGISTER_TYPE__(int) +REGISTER_TYPE__(float) +REGISTER_TYPE__(double) +REGISTER_TYPE__(bool) +#undef REGISTER_TYPE__ +// @} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/symbol_table.h b/paddle/infrt/host_context/symbol_table.h new file mode 100644 index 0000000000000..805215a78ce0d --- /dev/null +++ b/paddle/infrt/host_context/symbol_table.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include <string>
+
+#include "paddle/infrt/host_context/value.h"
+
+namespace infrt {
+namespace host_context {
+
+/**
+ * SymbolTable holds all the states of the kernel graph in the runtime.
+ */
+class SymbolTable {
+ public:
+  SymbolTable();
+
+  /**
+   * Register a state called \p key.
+   */
+  Value* Register(const std::string& key);
+
+  Value* Register(const std::string& key, ValueRef value);
+
+  /**
+   * Register a state and set its value.
+   */
+  template <typename T>
+  Value* Register(const std::string& key, T&& v);
+
+  size_t size() const;
+
+  /**
+   * Get a state called \p key.
+   */
+  Value* GetValue(const std::string& key) const;
+
+  template <typename T>
+  T Get(const std::string& key);
+
+  ~SymbolTable();
+
+ private:
+  class Impl;
+
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace host_context
+}  // namespace infrt
diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc
new file mode 100644
index 0000000000000..8c3ccba3d0ba5
--- /dev/null
+++ b/paddle/infrt/host_context/value.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
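Editor's note: a short usage sketch of the SymbolTable declared above, based on the Register/Get specializations in symbol_table.cc (int, float, double and bool are the types with Register specializations there); the variable names are illustrative:

#include <cstdint>

#include "paddle/infrt/host_context/symbol_table.h"

void SymbolTableDemo() {
  infrt::host_context::SymbolTable table;
  table.Register("a", 1);     // creates a Value holding an int
  table.Register("b", 2.5f);  // creates a Value holding a float
  table.Register("c");        // creates an empty Value for a kernel to fill in later

  infrt::host_context::Value* c = table.GetValue("c");  // nullptr if the key is absent
  int32_t a = table.Get<int32_t>("a");                  // typed read; CHECK-fails on a missing key
  (void)c;
  (void)a;
}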
+ +#include "paddle/infrt/host_context/value.h" + +#include "paddle/infrt/tensor/dense_tensor_view.h" + +namespace infrt { +namespace host_context { + +ValueRef::ValueRef(int32_t val) : Shared(new Value(val)) {} +ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} +ValueRef::ValueRef(float val) : Shared(new Value(val)) {} +ValueRef::ValueRef(double val) : Shared(new Value(val)) {} +ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} + +const char* Value::type_info() const { return __type_info__; } + +void CopyTo(const Value& from, Value* to) { + CHECK(from.valid()) << "from value is not valid, can't be copied"; + CHECK(to) << "to is not valid"; + visit( + [&](auto&& arg) { + using T = std::decay_t; + if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same>::value) + to->data = arg; + else if (std::is_same>::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else + LOG(FATAL) << "Not supported Value copy: " << typeid(T).name(); + }, + from.data); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h new file mode 100644 index 0000000000000..4a2b92a7e69c5 --- /dev/null +++ b/paddle/infrt/host_context/value.h @@ -0,0 +1,156 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/common/object.h" +#include "paddle/infrt/common/shared.h" +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/support/variant.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/tensor/dense_tensor_view.h" +#include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt { +namespace host_context { + +struct MlirFunctionExecutable; + +using ValueVariantType = Variant, + std::vector, + std::vector, + std::vector, + std::vector>; + +//! Copy content from \param from to \param to. +void CopyTo(const Value& from, Value* to); + +/** + * Represents any data type for value in host context. 
+ */ +class Value : public common::Object { + public: + using variant_type = ValueVariantType; + + explicit Value() {} // NOLINT + explicit Value(int32_t x) : data(x) {} + explicit Value(int64_t x) : data(x) {} + explicit Value(float x) : data(x) {} + explicit Value(double x) : data(x) {} + explicit Value(bool x) : data(x) {} + explicit Value(std::string x) : data(x) {} + explicit Value(tensor::TensorMap&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} + explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} + explicit Value(MlirFunctionExecutable* x) : data(x) {} + + template + const T& get() const { + return data.get(); + } + template + T& get() { + return data.get(); + } + + template + void set(T&& v) { + data = std::move(v); + } + + void set(Value* v) { data = std::move(v->data); } + + bool valid() const { return true; } + + const char* type_info() const override; + + friend void CopyTo(const Value& from, Value* to); + + private: + ValueVariantType data; + static constexpr const char* __type_info__ = "host_context_value"; +}; + +/** + * Represents a counted reference of a Value. + */ +class ValueRef : common::Shared { + public: + ValueRef() = default; + explicit ValueRef(Value* n) : common::Shared(n) {} + explicit ValueRef(int32_t val); + explicit ValueRef(int64_t val); + explicit ValueRef(float val); + explicit ValueRef(double val); + explicit ValueRef(bool val); + + using common::Shared::get; + using common::Shared::Reset; + using common::Shared::operator->; + using common::Shared::operator*; + //! Get a readonly data. + template + const T& get() const { + CHECK(p_); + return p_->get(); + } + + template + T& get() { + CHECK(p_); + return p_->get(); + } + + //! Assign a data. + template + void Assign(const T& x) { + if (!p_) { + p_ = common::make_shared(); + } + *p_ = x; + } + + template + void Assign(Args... args) { + p_ = common::make_shared(std::forward(args)...); + } + + inline bool IsValid() { return p_; } +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value_test.cc b/paddle/infrt/host_context/value_test.cc new file mode 100644 index 0000000000000..48d49478ce0ef --- /dev/null +++ b/paddle/infrt/host_context/value_test.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
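Editor's note: the unit test that follows only exercises ValueRef construction and typed get(); CopyTo and Value::set from value.h/value.cc above are not covered there, so here is a minimal sketch of them. It assumes the underlying Variant accepts assignment from its element types, which is what CopyTo itself relies on:

#include <cstdint>

#include "paddle/infrt/host_context/value.h"

void ValueDemo() {
  using infrt::host_context::CopyTo;
  using infrt::host_context::Value;
  using infrt::host_context::ValueRef;

  Value a(3.14f);  // holds a float inside ValueVariantType
  Value b(0);      // starts out holding an int32_t
  CopyTo(a, &b);   // b now holds a copy of the float payload
  float f = b.get<float>();

  ValueRef r(new Value(7));  // reference-counted wrapper around a heap-allocated Value
  r->set(42);                // overwrite the payload in place
  int32_t v = r->get<int32_t>();
  (void)f;
  (void)v;
}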
+ +#include "paddle/infrt/host_context/value.h" + +#include + +namespace infrt { +namespace host_context { + +TEST(ValueRef, test) { + ValueRef x(12); + ASSERT_EQ(x.get(), 12); + + ValueRef y(1.2f); + ASSERT_EQ(y.get(), 1.2f); + + ValueRef z(true); + ASSERT_EQ(z.get(), true); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt new file mode 100644 index 0000000000000..da858aad28f81 --- /dev/null +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -0,0 +1,9 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + basic_kernels.cc + test_kernels.cc + tensor_shape_kernels.cc + tensor_kernels.cc + control_flow_kernels.cc + ) diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc new file mode 100644 index 0000000000000..d7f2c3865157d --- /dev/null +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/basic_kernels.h" + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +using infrt::host_context::Attribute; + +namespace infrt::kernel { + +template +T add(T a, T b) { + return a + b; +} + +template +T sub(T a, T b) { + return a - b; +} + +template +T mul(T a, T b) { + return a * b; +} + +template +T div(T a, T b) { + return a / b; +} + +template +void print(T a) { + std::cout << a << std::endl; +} + +static std::string GetString(Attribute value) { + return value.get(); +} + +static void PrintString(const std::string &str) { + llvm::outs() << "string = " << str << '\n'; + llvm::outs().flush(); +} + +void RegisterBasicKernels(host_context::KernelRegistry *registry) { + RegisterIntBasicKernels(registry); + RegisterFloatBasicKernels(registry); + registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); +} + +void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); +} + +void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h new file mode 100644 index 0000000000000..9e98885cf6ebf --- /dev/null 
+++ b/paddle/infrt/kernel/basic_kernels.h @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt::host_context { + +struct KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +/** + * Register all the basic kernels to \p registry. + */ +void RegisterBasicKernels(host_context::KernelRegistry* registry); + +void RegisterIntBasicKernels(host_context::KernelRegistry* registry); +void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc new file mode 100644 index 0000000000000..6cc94dbcce077 --- /dev/null +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/control_flow_kernels.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" + +namespace infrt { +namespace kernel { + +static void INFRTCall( + host_context::RemainingArguments args, + host_context::RemainingResults results, + host_context::Attribute fn) { + VLOG(3) << "running call kernel ..."; + CHECK_EQ(fn.get()->num_arguments(), args.size()); + CHECK_EQ(fn.get()->num_results(), results.size()); + + for (auto& v : results.values()) { + CHECK(v.get()); + } + fn.get()->Execute(args.values(), results.values()); +} + +void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/control_flow_kernels.h b/paddle/infrt/kernel/control_flow_kernels.h new file mode 100644 index 0000000000000..5fa6b985f0b17 --- /dev/null +++ b/paddle/infrt/kernel/control_flow_kernels.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt { + +namespace host_context { +struct KernelRegistry; +} // namespace host_context + +namespace kernel { + +void RegisterControlFlowKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc new file mode 100644 index 0000000000000..2fa477aa4dbda --- /dev/null +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensor_kernels.h" + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/tensor/dense_tensor_view.h" +#include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::kernel { +using namespace host_context; // NOLINT +using namespace tensor; // NOLINT + +/// ===== Kernel begin ==== + +template +DenseHostTensor CreateUninitTensor(Attribute> shape) { + const auto &shape_data = shape.get(); + auto array = llvm::ArrayRef(shape_data.data(), shape_data.size()); + auto type = GetDType(); + return DenseHostTensor(TensorShape(array), type); +} + +void PrintTensor(const DenseHostTensor &tensor) { + std::cout << tensor << std::endl; +} + +template +void FillTensorWithConstant(DenseHostTensor *tensor, Attribute v) { + MutableDTArrayView(tensor).Fill(v.get()); +} + +TensorMap LoadParams(const std::string &path) { + return *(infrt::tensor::LoadParams(path)); +} + +DenseHostTensor GetParam(TensorMap map, Attribute nameAttr) { + auto &name = nameAttr.get(); + return *(map[name]); +} + +DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } + +/// ===== Kernel end ==== + +void RegisterTensorKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("dt.create_uninit_tensor.f32", + INFRT_KERNEL(CreateUninitTensor)); + registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"}); + registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); + registry->AddKernel("dt.fill_tensor_with_constant.f32", + INFRT_KERNEL(FillTensorWithConstant)); + registry->AddKernel("dt.fill_tensor_with_constant.f64", + INFRT_KERNEL(FillTensorWithConstant)); + registry->AddKernel("dt.load_params", 
INFRT_KERNEL(LoadParams)); + registry->AddKernel("dt.get_param", INFRT_KERNEL(GetParam)); + registry->AddKernel("dt.shallow_copy_tensor", + INFRT_KERNEL(ShallowCopyTensor)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h new file mode 100644 index 0000000000000..8f2180ba80a4f --- /dev/null +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -0,0 +1,25 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace infrt::host_context { +struct KernelRegistry; +} // namespace infrt::host_context + +namespace infrt::kernel { + +void RegisterTensorKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc new file mode 100644 index 0000000000000..a04b492819298 --- /dev/null +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensor_shape_kernels.h" + +#include +#include +#include + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::kernel { + +void PrintShape(const tensor::TensorShape& shape) { + llvm::raw_os_ostream oos(std::cout); + oos << shape << '\n'; +} + +void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h new file mode 100644 index 0000000000000..e87c6c37e88a0 --- /dev/null +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace infrt::host_context { + +class KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc new file mode 100644 index 0000000000000..d5f64d09b602f --- /dev/null +++ b/paddle/infrt/kernel/test_kernels.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/test_kernels.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" + +using infrt::host_context::Attribute; +using infrt::host_context::MlirFunctionExecutable; +using infrt::host_context::RemainingArguments; + +namespace infrt::kernel { +namespace { +class BenchmarkStats { + public: + BenchmarkStats(std::string name, + int num_warmup_runs, + int max_count, + std::chrono::microseconds benchmark_duration) + : name_{name}, + num_warmup_runs_{num_warmup_runs}, + max_count_{max_count}, + benchmark_duration_{benchmark_duration} {} + + void StartRun() { + ++cur_count_; + // Start recording CPU time. + cur_start_walltime_ = std::chrono::steady_clock::now(); + cur_start_cpu_ = std::clock(); + } + + void StopRun() { + // Do not collect the runtime statistics if we are still in the warm up + // period. + if (cur_count_ <= num_warmup_runs_) return; + + // Stop the CPU timer. + std::clock_t cur_stop_cpu_ = std::clock(); + + // Stop the wall clock timer. + auto cur_stop_walltime_ = std::chrono::steady_clock::now(); + + // Collect the wall clock duration. + auto duration_walltime_ = cur_stop_walltime_ - cur_start_walltime_; + run_times_walltime_.push_back(duration_walltime_); + + // Collect the CPU duration in microseconds. + // First cast to integer that represents microseconds with truncation, as + // does std::chrono::duration_cast. Then cast to std::chrono::microseconds. + std::clock_t duration_cpu_raw = cur_stop_cpu_ - cur_start_cpu_; + auto duration_cpu_ = static_cast( + static_cast(1e9 * duration_cpu_raw / CLOCKS_PER_SEC)); + + run_times_cpu_.push_back(duration_cpu_); + + total_duration_walltime_ += duration_walltime_; + total_duration_cpu_ += duration_cpu_; + } + // Return if we should we run more rounds. + bool MoreRun() const { + return cur_count_ < max_count_ + num_warmup_runs_ && + total_duration_walltime_ < benchmark_duration_; + } + + // Summarize the benchmark results. 
+ void Summarize() { + std::sort(run_times_walltime_.begin(), run_times_walltime_.end()); + std::sort(run_times_cpu_.begin(), run_times_cpu_.end()); + + auto percentile = []( + double p, const std::vector &run_times) { + assert(p >= 0.0 && p <= 1.0); + return run_times[run_times.size() * p]; + }; + + // BM: prefix is added to make grepping results from lit output easier. + std::string prefix; + llvm::raw_string_ostream(prefix) << "BM:" << name_ << ':'; + auto cpu_utilization = + total_duration_cpu_.count() * 100.0 / total_duration_walltime_.count(); + + llvm::outs() << prefix << "Count: " << run_times_walltime_.size() << '\n'; + llvm::outs() << prefix + << "Duration(ns): " << total_duration_walltime_.count() + << '\n'; + llvm::outs() << prefix + << "Time Min(ns): " << run_times_walltime_.front().count() + << '\n'; + llvm::outs() << prefix + << "Time Max(ns): " << run_times_walltime_.back().count() + << '\n'; + llvm::outs() << prefix << "Time 50%(ns): " + << percentile(0.5, run_times_walltime_).count() << '\n'; + llvm::outs() << prefix << "Time 95%(ns): " + << percentile(0.95, run_times_walltime_).count() << '\n'; + llvm::outs() << prefix << "Time 99%(ns): " + << percentile(0.99, run_times_walltime_).count() << '\n'; + // Log CPU time statistics. + llvm::outs() << prefix + << "CPU Duration(ns): " << total_duration_cpu_.count() << '\n'; + llvm::outs() << prefix << "CPU Min(ns): " << run_times_cpu_.front().count() + << '\n'; + llvm::outs() << prefix << "CPU Max(ns): " << run_times_cpu_.back().count() + << '\n'; + llvm::outs() << prefix + << "CPU 50%(ns): " << percentile(0.5, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix + << "CPU 95%(ns): " << percentile(0.95, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix + << "CPU 99%(ns): " << percentile(0.99, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix << "CPU utilization(percent): " << cpu_utilization + << "\n"; + llvm::outs().flush(); + } + + private: + const std::string name_; + const int num_warmup_runs_; + const int max_count_; + int cur_count_ = 0; + const std::chrono::nanoseconds benchmark_duration_; + std::chrono::nanoseconds total_duration_walltime_{}; + std::chrono::nanoseconds total_duration_cpu_{}; + std::chrono::time_point cur_start_walltime_{}; + std::clock_t cur_start_cpu_; + std::vector run_times_walltime_; + // CPU run times in microseconds. + std::vector run_times_cpu_; +}; + +} // anonymous namespace + +// This op benchmarks the input function by running the function in a loop +// up to a max count or max time as specified in the function's attributes. +// +// Attributes: +// duration_secs: Benchmark duration in seconds. +// max_count: Max run count of input function. +// name: The name used to tag the benchmark results. +// num_warmup_runs: Number of warm up runs before benchmarking starts. +// fn: The input function to be benchmarked. +static void benchmark(RemainingArguments args, + host_context::RemainingResults results, + Attribute duration_secs, + Attribute max_count, + Attribute name, + Attribute num_warmup_runs, + Attribute fn) { + BenchmarkStats bm_stats{name.get(), + num_warmup_runs.get(), + max_count.get(), + std::chrono::seconds(duration_secs.get())}; + + while (bm_stats.MoreRun()) { + bm_stats.StartRun(); + fn.get()->Execute(args.values(), results.values(), true); + bm_stats.StopRun(); + } + bm_stats.Summarize(); +} + +// Just copy the input to the result. 
+tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { + return src; +} + +void RegisterTestKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("infrt.test.shadow_copy_tensor", + INFRT_KERNEL(ShadowCopyTensor)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h new file mode 100644 index 0000000000000..f42884dfaf2c9 --- /dev/null +++ b/paddle/infrt/kernel/test_kernels.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt::host_context { + +struct KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +/** + * Register all the test kernels to registry. + */ +void RegisterTestKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/paddle/CMakeLists.txt b/paddle/infrt/paddle/CMakeLists.txt new file mode 100644 index 0000000000000..172d78ecde3b8 --- /dev/null +++ b/paddle/infrt/paddle/CMakeLists.txt @@ -0,0 +1,24 @@ +proto_library(paddle_framework_proto SRCS framework.proto) + +add_subdirectory(cpp) +add_subdirectory(pb) + +core_gather_headers() + +gather_srcs(infrt_src SRCS + model_parser.cc + scope.cc + tensor.cc + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/cpp/CMakeLists.txt b/paddle/infrt/paddle/cpp/CMakeLists.txt new file mode 100644 index 0000000000000..0feaabd2fa7c9 --- /dev/null +++ b/paddle/infrt/paddle/cpp/CMakeLists.txt @@ -0,0 +1,16 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/cpp/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h new file mode 100644 index 0000000000000..ccd79c048ab14 --- /dev/null +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -0,0 +1,229 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace infrt::paddle::cpp { + +/* + * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc + * classes should implement this. + */ +class VarDescAPI { + public: + enum class Type { + // Pod Types + BOOL = 0, + INT16, + INT32, + INT64, + FP16, + FP32, + FP64, + // Tensor is used in C++. + SIZE_T, + UINT8, + INT8, + + // Other types that may need additional descriptions + LOD_TENSOR, + SELECTED_ROWS, + FEED_MINIBATCH, + FETCH_LIST, + STEP_SCOPES, + LOD_RANK_TABLE, + LOD_TENSOR_ARRAY, + PLACE_LIST, + READER, + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW, + TUPLE + }; + + using VarDataType = Type; + + virtual ~VarDescAPI() = default; + + // Get var's name + virtual std::string Name() const = 0; + // Set var's name + virtual void SetName(std::string name) = 0; + // Get var's type + virtual Type GetType() const = 0; + // Set var's type + virtual void SetType(Type type) = 0; + // Tell whether var is persistable or not + virtual bool Persistable() const = 0; + // Set var to be persistable or not + virtual void SetPersistable(bool persistable) = 0; + // Get var's shape + virtual std::vector GetShape() const = 0; + // Set var's shape + virtual void SetShape(const std::vector& dims) = 0; +}; + +/* + * NOTE Some interfaces are weried, we remain them unchanged to keep compatible + * with framework::OpDesc in Fluid framework. + */ +class OpDescAPI { + public: + // The AttrType is used to make the proto::AttrType portable. + enum class AttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, + }; + + virtual ~OpDescAPI() = default; + + /// Get operator's type. + virtual std::string Type() const = 0; + /// Set operator's type. + virtual void SetType(const std::string& type) = 0; + /// Get arguments given the parameter. + virtual std::vector Input(const std::string& param) const = 0; + /// Get parameters. + virtual std::vector InputArgumentNames() const = 0; + /// Get arguments given the parameter. + virtual std::vector Output(const std::string& param) const = 0; + /// Get parameters. + virtual std::vector OutputArgumentNames() const = 0; + /// Set a input given the parameter and arguments. + virtual void SetInput(const std::string& param, + const std::vector& args) = 0; + virtual void SetOutput(const std::string& param, + const std::vector& args) = 0; + /// Tell whether this desc has an attribute. + virtual bool HasAttr(const std::string& name) const = 0; + + /// Get the type of an attribute. + virtual AttrType GetAttrType(const std::string& name) const = 0; + + virtual std::vector AttrNames() const = 0; + + /// Set an attribute. + template + void SetAttr(const std::string& name, const T& v); + + /// Get an attribute. 
+ template + T GetAttr(const std::string& name) const; + + std::string Repr() const { + std::stringstream ss; + ss << Type(); + ss << "("; + for (auto& arg : InputArgumentNames()) { + ss << arg << ":"; + for (auto val : Input(arg)) { + ss << val << " "; + } + } + ss << ") -> ("; + for (auto& arg : OutputArgumentNames()) { + ss << arg << ":"; + for (auto val : Output(arg)) { + ss << val << " "; + } + } + ss << ")"; + return ss.str(); + } +}; + +class BlockDescAPI { + public: + virtual ~BlockDescAPI() = default; + + virtual int32_t Idx() const = 0; + + virtual void SetIdx(int32_t idx) = 0; + + virtual int32_t ParentIdx() const = 0; + + virtual void SetParentIdx(int32_t idx) = 0; + + virtual size_t VarsSize() const = 0; + + virtual void ClearVars() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetVar(int32_t idx); + + template + T* AddVar(); + + virtual size_t OpsSize() const = 0; + + virtual void ClearOps() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetOp(int32_t idx); + + template + T* AddOp(); + + virtual int32_t ForwardBlockIdx() const = 0; + + virtual void SetForwardBlockIdx(int32_t idx) = 0; +}; + +class ProgramDescAPI { + public: + virtual ~ProgramDescAPI() = default; + + virtual size_t BlocksSize() const = 0; + + virtual void ClearBlocks() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetBlock(int32_t idx); + + template + T* AddBlock(); + + virtual bool HasVersion() const = 0; + + virtual int64_t Version() const = 0; + + virtual void SetVersion(int64_t version) = 0; +}; + +} // namespace infrt::paddle::cpp diff --git a/paddle/infrt/paddle/framework.proto b/paddle/infrt/paddle/framework.proto new file mode 100644 index 0000000000000..634ec9665d08e --- /dev/null +++ b/paddle/infrt/paddle/framework.proto @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +package paddle.framework.proto; + +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; + BLOCKS = 10; + LONGS = 11; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. 
+message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + repeated int32 blocks_idx = 14; + repeated int64 longs = 15; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + // Tensor is used in C++. + SIZE_T = 19; + UINT8 = 20; + INT8 = 21; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. 
Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; +} + +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; + // True if the variable is an input data and + // have to check the feed data shape and dtype + optional bool need_check_feed = 4 [ default = false ]; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// CompatibleInfo is used to determine if a feature is compatible and +// provides the information. +message CompatibleInfo { + enum Type { + COMPATIBLE = 0; + DEFINITELY_NOT = 1; + POSSIBLE = 2; + BUG_FIX = 3; + PRECISION_CHANGE = 4; + } + required string version = 1; + required Type type = 2; +} + +// In some cases, Paddle Fluid may perform operator definition iterations, +// and the operator uses OpCompatibleMap for compatibility testing. +message OpCompatibleMap { + message OpCompatiblePair { + required string op_name = 1; + required CompatibleInfo compatible_info = 2; + } + repeated OpCompatiblePair pair = 1; + optional string default_required_version = 2; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { + reserved 2; // For backward compatibility. + repeated BlockDesc blocks = 1; + optional Version version = 4; + optional OpCompatibleMap op_compatible_map = 3; +} \ No newline at end of file diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc new file mode 100644 index 0000000000000..285280e69435b --- /dev/null +++ b/paddle/infrt/paddle/model_parser.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
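Editor's note: model_parser.cc below reads these messages back from a saved model; as a standalone illustration, loading a whole serialized ProgramDesc with the generated protobuf API could look like the sketch below. The generated header path is an assumption based on the proto_library() rule earlier in this patch, and error handling is reduced to glog CHECKs:

#include <fstream>
#include <iterator>
#include <string>

#include <glog/logging.h>

#include "paddle/infrt/paddle/framework.pb.h"  // assumed output name of framework.proto

paddle::framework::proto::ProgramDesc LoadProgramDesc(const std::string& path) {
  std::ifstream fin(path, std::ios::binary);
  CHECK(fin.is_open()) << "Cannot open " << path;
  std::string buf((std::istreambuf_iterator<char>(fin)),
                  std::istreambuf_iterator<char>());
  paddle::framework::proto::ProgramDesc program;
  CHECK(program.ParseFromString(buf)) << "Cannot parse ProgramDesc from " << path;
  return program;
}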
+ +#include "paddle/infrt/paddle/model_parser.h" + +#include +#include + +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/common/target.h" +#include "paddle/infrt/common/type.h" + +namespace infrt::paddle { + +int SizeOfType(framework_proto::VarType::Type type) { + using Type = framework_proto::VarType::Type; + switch (static_cast(type)) { +#define DO(desc, type) \ + case Type::VarType_Type_##desc: \ + return sizeof(type); + DO(BOOL, bool); + DO(FP16, float); + DO(FP32, float); + DO(INT8, int8_t); + DO(INT16, int16_t); + DO(INT32, int); + DO(INT64, int64_t); +#undef DO + default: + LOG(FATAL) << "unknown data type " << type; + } + return -1; +} + +void TensorFromStream(std::istream &is, + _Tensor_ *tensor, + const common::Target &target) { + using Type = framework_proto::VarType::Type; + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; + // read tensor desc + framework_proto::VarType::TensorDesc desc; + { + // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + CHECK(desc.ParseFromArray(buf.get(), size)) << "Cannot parse tensor desc"; + } + + // read tensor + std::vector dims_vec; + std::copy( + desc.dims().begin(), desc.dims().end(), std::back_inserter(dims_vec)); + Shape dims(dims_vec); + tensor->Resize(dims); + void *buf; + size_t size = tensor->shape().numel() * SizeOfType(desc.data_type()); + // alllocate memory + if (target.arch == Target::Arch::X86) { + switch (static_cast(desc.data_type())) { +#define SET_TENSOR(desc, type, precision) \ + case Type::VarType_Type_##desc: \ + buf = tensor->mutable_data(target); \ + tensor->set_type(precision); \ + break + + SET_TENSOR(FP32, float, Float(32)); + SET_TENSOR(INT8, int8_t, Int(8)); + SET_TENSOR(INT16, int16_t, Int(16)); + SET_TENSOR(INT32, int32_t, Int(32)); + SET_TENSOR(INT64, int64_t, Int(64)); +#undef SET_TENSOR + default: + LOG(FATAL) << "unknown type " << desc.data_type(); + } + // tensor->set_persistable(true); + is.read(static_cast(buf), size); + } else if (target.arch == Target::Arch::NVGPU) { +#ifdef INFRT_WITH_CUDA + if (desc.data_type() != Type::VarType_Type_FP32) + LOG(FATAL) << "[CUDA] The type is not fp32!!"; + auto *data = tensor->mutable_data(target); + tensor->set_type(infrt::common::Float(32)); + std::vector temp(tensor->shape().numel()); + // LOG(INFO) <<"[CUDA] The tensor's size is "<< tensor->shape().numel(); + is.read(reinterpret_cast(temp.data()), size); + CUDA_CALL(cudaMemcpy(reinterpret_cast(data), + temp.data(), + tensor->shape().numel() * sizeof(float), + cudaMemcpyHostToDevice)); +#else + LOG(FATAL) << "To use CUDA backends, you need to set WITH_CUDA ON!"; +#endif + } else { + INFRT_NOT_IMPLEMENTED + } +} + +void LoadLoDTensor(std::istream &is, _Variable *var, const Target &target) { + auto &tensor = var->get(); + uint32_t version{}; + is.read(reinterpret_cast(&version), sizeof(version)); + VLOG(3) << "model version " << version; + + // Load LoD information + uint64_t lod_level{}; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(uint64_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + // lod[i] = tmp; + } + + TensorFromStream(is, tensor.operator->(), target); +} + +void 
ReadBinaryFile(const std::string &filename, std::string *contents) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + CHECK(fin.is_open()) << "Cannot open file: " << filename; + fin.seekg(0, std::ios::end); + auto size = fin.tellg(); + contents->clear(); + contents->resize(size); + fin.seekg(0, std::ios::beg); + fin.read(&(contents->at(0)), contents->size()); + fin.close(); +} + +std::unique_ptr LoadProgram( + const std::string &path, bool program_from_memory) { + std::unique_ptr main_program( + new framework_proto::ProgramDesc); + if (!program_from_memory) { + std::string desc_str; + ReadBinaryFile(path, &desc_str); + main_program->ParseFromString(desc_str); + } else { + main_program->ParseFromString(path); + } + return main_program; +} + +void LoadParams(const std::string &path) {} + +// Load directly to CPU, and latter transfer to other devices. +void LoadParam(const std::string &path, _Variable *out, const Target &target) { + std::ifstream fin(path, std::ios::binary); + CHECK(fin.is_open()) << "failed to open file " << path; + LoadLoDTensor(fin, out, target); +} + +} // namespace infrt::paddle diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h new file mode 100644 index 0000000000000..73125fadedb82 --- /dev/null +++ b/paddle/infrt/paddle/model_parser.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/infrt/paddle/framework.pb.h" +#include "paddle/infrt/paddle/pb/block_desc.h" +#include "paddle/infrt/paddle/pb/op_desc.h" +#include "paddle/infrt/paddle/pb/program_desc.h" +#include "paddle/infrt/paddle/scope.h" +#include "paddle/infrt/paddle/tensor.h" + +namespace infrt::paddle { +namespace framework_proto = ::paddle::framework::proto; + +// Read a __model__ file. +std::unique_ptr LoadProgram( + const std::string& path, bool program_from_memory = false); + +void LoadLoDTensor(std::istream& is, + _Variable* var, + const common::Target& target); + +// Read a single file containing all the parameters. +void LoadParams(const std::string& path); + +// Load a single parameter to an output tensor. 
+void LoadParam(const std::string& path, + _Variable* out, + const common::Target& target); + +// LoDTensor to ostream +void TensorToStream(std::ostream& os, const _Tensor_& tensor); +void TensorFromStream( + std::istream& is, + _Tensor_* tensor, + const common::Target& target = common::DefaultHostTarget()); +void ReadBinaryFile(const std::string& filename, std::string* contents); + +} // namespace infrt::paddle diff --git a/paddle/infrt/paddle/pb/CMakeLists.txt b/paddle/infrt/paddle/pb/CMakeLists.txt new file mode 100644 index 0000000000000..fac38afa62db2 --- /dev/null +++ b/paddle/infrt/paddle/pb/CMakeLists.txt @@ -0,0 +1,20 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + var_desc.cc + op_desc.cc + block_desc.cc + program_desc.cc + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/pb/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc new file mode 100644 index 0000000000000..11186bc68af16 --- /dev/null +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/paddle/pb/block_desc.h" + +namespace infrt::paddle::pb { + +template <> +framework_proto::VarDesc* BlockDesc::GetVar( + int32_t idx) { + CHECK_LT(idx, static_cast(VarsSize())) << "idx >= vars.size()"; + return desc_->mutable_vars(idx); +} + +template <> +framework_proto::VarDesc* BlockDesc::AddVar() { + return desc_->add_vars(); +} + +template <> +framework_proto::OpDesc* BlockDesc::GetOp( + int32_t idx) { + CHECK_LT(idx, static_cast(OpsSize())) << "idx >= ops.size()"; + return desc_->mutable_ops(idx); +} + +template <> +framework_proto::OpDesc* BlockDesc::AddOp() { + return desc_->add_ops(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h new file mode 100644 index 0000000000000..9c1b7f9adf172 --- /dev/null +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
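Editor's note: a brief usage sketch (not part of the patch) of the pb::BlockDesc wrapper, whose template specializations appear above and whose declaration follows in block_desc.h; the variable name and op type are hypothetical.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/block_desc.h"

void BuildBlock(::paddle::framework::proto::BlockDesc* raw_block) {
  infrt::paddle::pb::BlockDesc block(raw_block);

  auto* var = block.AddVar<::paddle::framework::proto::VarDesc>();
  var->set_name("w");  // hypothetical variable name

  auto* op = block.AddOp<::paddle::framework::proto::OpDesc>();
  op->set_type("relu");  // hypothetical op type

  // Assuming raw_block started out empty.
  CHECK_EQ(block.VarsSize(), 1u);
  CHECK_EQ(block.OpsSize(), 1u);
}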
+ +#pragma once +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { + +namespace framework_proto = ::paddle::framework::proto; + +class BlockDesc : public cpp::BlockDescAPI { + public: + BlockDesc() = delete; + + explicit BlockDesc(framework_proto::BlockDesc* desc) : desc_(desc) { + CHECK(desc_); + } + + framework_proto::BlockDesc* Proto() { return desc_; } + + const framework_proto::BlockDesc& ReadonlyProto() const { return *desc_; } + + int32_t Idx() const override { return desc_->idx(); } + + void SetIdx(int32_t idx) override { desc_->set_idx(idx); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + void SetParentIdx(int32_t idx) override { desc_->set_parent_idx(idx); } + + size_t VarsSize() const override { return desc_->vars_size(); } + + void ClearVars() override { desc_->clear_vars(); } + + template + T* GetVar(int32_t idx); + + template + T* AddVar(); + + size_t OpsSize() const override { return desc_->ops_size(); } + + void ClearOps() override { desc_->clear_ops(); } + + template + T* GetOp(int32_t idx); + + template + T* AddOp(); + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + void SetForwardBlockIdx(int32_t idx) override { + desc_->set_forward_block_idx(idx); + } + + private: + framework_proto::BlockDesc* desc_; // not_own +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc new file mode 100644 index 0000000000000..c7b1e66f50642 --- /dev/null +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
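Editor's note: op_desc.cc below supplies the attribute plumbing (FindAttr plus the SetAttr/GetAttr specializations); the sketch here (not part of the patch) shows the intended round trip, with hypothetical op and attribute names.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/op_desc.h"

void AttrRoundTrip(::paddle::framework::proto::OpDesc* raw_op) {
  raw_op->set_type("softmax");  // hypothetical op type
  infrt::paddle::pb::OpDesc op(raw_op);

  op.SetAttr<int>("axis", -1);       // stored as INT
  op.SetAttr<float>("scale", 0.5f);  // stored as FLOAT
  op.SetAttr<std::string>("name", "softmax_0");

  CHECK(op.HasAttr("axis"));
  int axis = op.GetAttr<int>("axis");
  (void)axis;
  // AttrNames() now returns {"axis", "scale", "name"}.
}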
+ +#include "paddle/infrt/paddle/pb/op_desc.h" + +namespace infrt::paddle::pb { + +google::protobuf::internal::RepeatedPtrIterator +FindAttr(framework_proto::OpDesc *desc, const std::string &name) { + auto &xs = *desc->mutable_attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + if (it == xs.end()) { + auto *attr = xs.Add(); + attr->set_name(name); + it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + } + return it; +} + +#define SET_IMPL_ONE(T, ty__, pb_f__) \ + template <> \ + void OpDesc::SetAttr(const std::string &name, const T &v) { \ + auto it = FindAttr(desc_, name); \ + it->set_type(framework_proto::ty__); \ + it->set_##pb_f__(v); \ + } +SET_IMPL_ONE(int, INT, i); +SET_IMPL_ONE(float, FLOAT, f); +SET_IMPL_ONE(bool, BOOLEAN, b); +SET_IMPL_ONE(int64_t, LONG, l); + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::INTS); + it->clear_ints(); + for (auto &i : v) { + it->add_ints(i); + } +} + +template <> +void OpDesc::SetAttr(const std::string &name, + const std::string &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::STRING); + it->set_s(v.c_str()); +} + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::FLOATS); + it->clear_floats(); + for (auto &i : v) { + it->add_floats(i); + } +} + +template <> +void OpDesc::SetAttr>( + const std::string &name, const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::STRINGS); + it->clear_strings(); + for (auto &i : v) { + it->add_strings(i); + } +} + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::LONGS); + it->clear_longs(); + for (auto &i : v) { + it->add_longs(i); + } +} +google::protobuf::internal::RepeatedPtrIterator< + const framework_proto::OpDesc_Attr> +GetFindAttr(const framework_proto::OpDesc &desc, const std::string &name) { + auto &xs = desc.attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + return it; +} + +#define GET_ATTR_IMPL(T, pb_f__) \ + template <> \ + T OpDesc::GetAttr(const std::string &name) const { \ + auto it = GetFindAttr(*desc_, name); \ + return it->pb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, pb_f__) \ + template <> \ + T OpDesc::GetAttr(const std::string &name) const { \ + auto it = GetFindAttr(*desc_, name); \ + T res; \ + for (const auto &v : it->pb_f__()) { \ + res.push_back(v); \ + } \ + return res; \ + } +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, strings); +GET_ATTR_IMPL(std::string, s); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h new file mode 100644 index 0000000000000..81d57d9f32252 --- /dev/null +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -0,0 +1,198 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" +#include "paddle/infrt/support/variant.h" + +namespace infrt::paddle::pb { + +namespace framework_proto = ::paddle::framework::proto; + +using Attribute = + Variant, std::vector>; +using VariableNameMap = std::map>; + +/* + * The lite::OpDesc, an light-weight implementation of wrapper of proto::OpDesc. + * Unlike the original one in framework::OpDesc, we remove the local members + * except the desc_, to avoid the inconsistent state, which is normal in the + * original interface and results in bugs. + */ +class OpDesc : public cpp::OpDescAPI { + public: + OpDesc() = delete; + + explicit OpDesc(framework_proto::OpDesc *desc) : desc_(desc) { CHECK(desc_); } + + framework_proto::OpDesc *Proto() { return desc_; } + const framework_proto::OpDesc &ReadonlyProto() const { return *desc_; } + + std::string Type() const override { return desc_->type(); } + + void SetType(const std::string &type) override { desc_->set_type(type); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string ¶m) const override { + return GetArguments(desc_->inputs(), param); + } + + std::vector InputArgumentNames() const override { + return GetArgumentNames(desc_->inputs()); + } + + void SetInput(const std::string ¶m, + const std::vector &args) override { + SetArgument(desc_->mutable_inputs(), param, args); + } + + std::vector Output(const std::string ¶m) const override { + return GetArguments(desc_->outputs(), param); + } + + std::vector OutputArgumentNames() const override { + return GetArgumentNames(desc_->outputs()); + } + + void SetOutput(const std::string ¶m, + const std::vector &args) override { + SetArgument(desc_->mutable_outputs(), param, args); + } + + bool HasAttr(const std::string &name) const override { + const auto &xs = desc_->attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + return it != xs.end(); + } + + AttrType GetAttrType(const std::string &name) const override { + const auto &xs = desc_->attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + CHECK(it != xs.end()); +#define DEF_ONE(type__) \ + case framework_proto::AttrType::type__: \ + return AttrType::type__; + + switch (it->type()) { + DEF_ONE(INT); + DEF_ONE(FLOAT); + DEF_ONE(STRING); + DEF_ONE(INTS); + DEF_ONE(FLOATS); + DEF_ONE(STRINGS); + DEF_ONE(BOOLEAN); + DEF_ONE(BOOLEANS); + DEF_ONE(BLOCK); + DEF_ONE(LONG); + DEF_ONE(BLOCKS); + DEF_ONE(LONGS); + default: + LOG(FATAL) << "Unknown attribute type"; + return static_cast(-1); + } +#undef DEF_ONE + } + + std::vector AttrNames() const override { + std::vector res; + const auto &xs = desc_->attrs(); + std::transform( + xs.begin(), + xs.end(), + std::back_inserter(res), + [](const framework_proto::OpDesc_Attr &x) { 
return x.name(); }); + return res; + } + + template + void SetAttr(const std::string &name, const T &v); + + template + T GetAttr(const std::string &name) const; + + private: + std::vector GetArguments( + const google::protobuf::RepeatedPtrField &xs, + const std::string ¶m) const { + std::vector res; + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Var &it) { + return it.parameter() == param; + }); + CHECK(it != xs.end()); + + const auto &ys = it->arguments(); + std::transform(ys.begin(), + ys.end(), + std::back_inserter(res), + [](const std::string &x) { return x; }); + return res; + } + + void SetArgument( + google::protobuf::RepeatedPtrField *xs, + const std::string ¶m, + const std::vector &args) { + auto it = std::find_if( + xs->begin(), xs->end(), [&](const framework_proto::OpDesc_Var &it) { + return it.parameter() == param; + }); + if (it == xs->end()) { + auto *new_arg = xs->Add(); + new_arg->set_parameter(param); + for (const auto &arg : args) { + *new_arg->mutable_arguments()->Add() = arg; + } + } else { + it->mutable_arguments()->Clear(); + for (const auto &arg : args) { + *it->mutable_arguments()->Add() = arg; + } + } + } + + std::vector GetArgumentNames( + const google::protobuf::RepeatedPtrField &xs) + const { + std::vector res; + std::transform( + xs.begin(), + xs.end(), + std::back_inserter(res), + [](const framework_proto::OpDesc_Var &x) { return x.parameter(); }); + return res; + } + + private: + framework_proto::OpDesc *desc_; +}; + +template <> +void OpDesc::SetAttr(const std::string &name, + const std::string &v); + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v); + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc new file mode 100644 index 0000000000000..ed8a7e36e0129 --- /dev/null +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/paddle/pb/program_desc.h" + +#include +#include + +namespace infrt::paddle::pb { + +template <> +framework_proto::BlockDesc* ProgramDesc::GetBlock( + int32_t idx) { + CHECK_LT(idx, static_cast(BlocksSize())) << "idx >= blocks.size()"; + return desc_->mutable_blocks(idx); +} + +template <> +framework_proto::BlockDesc* +ProgramDesc::AddBlock() { + return desc_->add_blocks(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h new file mode 100644 index 0000000000000..4adad650c974d --- /dev/null +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { +namespace framework_proto = ::paddle::framework::proto; + +class ProgramDesc : public cpp::ProgramDescAPI { + public: + ProgramDesc() = delete; + + explicit ProgramDesc(framework_proto::ProgramDesc *desc) : desc_(desc) { + CHECK(desc_); + } + + framework_proto::ProgramDesc *Proto() { return desc_; } + + const framework_proto::ProgramDesc &ReadonlyProto() const { return *desc_; } + + size_t BlocksSize() const override { return desc_->blocks_size(); } + + void ClearBlocks() override { desc_->clear_blocks(); } + + template + T *GetBlock(int32_t idx); + + template + T *AddBlock(); + + bool HasVersion() const override { return desc_->has_version(); } + + int64_t Version() const override { return desc_->version().version(); } + + void SetVersion(int64_t version) override { + desc_->mutable_version()->set_version(version); + } + + private: + framework_proto::ProgramDesc *desc_; // not_own +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc new file mode 100644 index 0000000000000..cf80df4f1b845 --- /dev/null +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
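Editor's note: with pb::ProgramDesc above in place, a typical read path over a loaded program looks like the sketch below (not part of the patch); it relies only on wrapper methods introduced in this diff.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/block_desc.h"
#include "paddle/infrt/paddle/pb/program_desc.h"

void DumpOps(::paddle::framework::proto::ProgramDesc* raw_program) {
  infrt::paddle::pb::ProgramDesc program(raw_program);
  for (size_t i = 0; i < program.BlocksSize(); ++i) {
    auto* block_proto = program.GetBlock<::paddle::framework::proto::BlockDesc>(
        static_cast<int32_t>(i));
    infrt::paddle::pb::BlockDesc block(block_proto);
    for (size_t j = 0; j < block.OpsSize(); ++j) {
      auto* op = block.GetOp<::paddle::framework::proto::OpDesc>(
          static_cast<int32_t>(j));
      VLOG(3) << "op: " << op->type();
    }
  }
}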
+ +#include "paddle/infrt/paddle/pb/var_desc.h" + +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { + +cpp::VarDescAPI::Type VarDesc::GetType() const { + auto type = desc_->type().type(); + +#define GET_TYPE_CASE_ITEM(type__) \ + case framework_proto::VarType::type__: \ + return cpp::VarDescAPI::Type::type__; + + switch (type) { + GET_TYPE_CASE_ITEM(LOD_TENSOR); + GET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + GET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + GET_TYPE_CASE_ITEM(SELECTED_ROWS); + GET_TYPE_CASE_ITEM(FEED_MINIBATCH); + GET_TYPE_CASE_ITEM(FETCH_LIST); + GET_TYPE_CASE_ITEM(STEP_SCOPES); + GET_TYPE_CASE_ITEM(PLACE_LIST); + GET_TYPE_CASE_ITEM(READER); + default: + LOG(FATAL) << "Unknown var type"; + return VarDescAPI::Type(); + } +#undef GET_TYPE_CASE_ITEM +} + +void VarDesc::SetType(VarDescAPI::Type type) { +#define SET_TYPE_CASE_ITEM(type__) \ + case VarDescAPI::Type::type__: \ + desc_->mutable_type()->set_type(framework_proto::VarType::type__); \ + break; + + switch (type) { + SET_TYPE_CASE_ITEM(LOD_TENSOR); + SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + SET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + SET_TYPE_CASE_ITEM(SELECTED_ROWS); + SET_TYPE_CASE_ITEM(FEED_MINIBATCH); + SET_TYPE_CASE_ITEM(FETCH_LIST); + SET_TYPE_CASE_ITEM(STEP_SCOPES); + SET_TYPE_CASE_ITEM(PLACE_LIST); + SET_TYPE_CASE_ITEM(READER); + default: + LOG(FATAL) << "Unknown var type"; + } +#undef SET_TYPE_CASE_ITEM +} + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_->type().type()) { + case framework_proto::VarType::READER: { + auto *lod_tensors_ptr = + desc_->mutable_type()->mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + LOG(FATAL) << "Setting 'sub_tensor_number' is not supported by the type " + "of var %s." + << this->Name(); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + return desc_->type().reader().lod_tensor_size(); + break; + default: + LOG(FATAL) << "Getting 'sub_tensor_number' is not supported by the type " + "of var %s." + << this->Name(); + } + return 0; +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(VarDescAPI::VarDataType data_type) { +#define SET_DATA_TYPE_CASE_ITEM(type__) \ + case cpp::VarDescAPI::Type::type__: \ + mutable_tensor_desc()->set_data_type(framework_proto::VarType::type__); \ + break; + + switch (data_type) { + SET_DATA_TYPE_CASE_ITEM(BOOL); + SET_DATA_TYPE_CASE_ITEM(SIZE_T); + SET_DATA_TYPE_CASE_ITEM(UINT8); + SET_DATA_TYPE_CASE_ITEM(INT8); + SET_DATA_TYPE_CASE_ITEM(INT16); + SET_DATA_TYPE_CASE_ITEM(INT32); + SET_DATA_TYPE_CASE_ITEM(INT64); + SET_DATA_TYPE_CASE_ITEM(FP16); + SET_DATA_TYPE_CASE_ITEM(FP32); + SET_DATA_TYPE_CASE_ITEM(FP64); + default: + LOG(FATAL) << "Unknown var type: " << static_cast(data_type); + } +#undef SET_DATA_TYPE_CASE_ITEM +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +// proto::VarType::Type VarDesc::GetDataType() const { +// return tensor_desc().data_type(); +// } +cpp::VarDescAPI::VarDataType VarDesc::GetDataType() const { + CHECK(desc_->has_type()) << "The var's type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + if (desc_->type().type() != framework_proto::VarType::LOD_TENSOR) { + return VarDescAPI::Type(); + } + auto type = tensor_desc().data_type(); +#define GET_DATA_TYPE_CASE_ITEM(type__) \ + case framework_proto::VarType::Type::VarType_Type_##type__: \ + return VarDescAPI::Type::type__ + + switch (type) { + GET_DATA_TYPE_CASE_ITEM(BOOL); + GET_DATA_TYPE_CASE_ITEM(SIZE_T); + GET_DATA_TYPE_CASE_ITEM(UINT8); + GET_DATA_TYPE_CASE_ITEM(INT8); + GET_DATA_TYPE_CASE_ITEM(INT16); + GET_DATA_TYPE_CASE_ITEM(INT32); + GET_DATA_TYPE_CASE_ITEM(INT64); + GET_DATA_TYPE_CASE_ITEM(FP16); + GET_DATA_TYPE_CASE_ITEM(FP32); + GET_DATA_TYPE_CASE_ITEM(FP64); + default: + LOG(FATAL) << "Unknown var type: " << static_cast(type); + return VarDescAPI::Type(); + } +#undef GET_DATA_TYPE_CASE_ITEM +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_->type().type()) { + case framework_proto::VarType::LOD_TENSOR: + desc_->mutable_type()->mutable_lod_tensor()->set_lod_level(lod_level); + break; + case framework_proto::VarType::LOD_TENSOR_ARRAY: + desc_->mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); + break; 
+ default: + LOG(FATAL) + << "Setting 'lod_level' is not supported by the type of var %s." + << this->Name(); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_->type().type()) { + case framework_proto::VarType::READER: { + size_t i = 0; + for (auto &lod_tensor : + *desc_->mutable_type()->mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + LOG(FATAL) + << "Setting 'lod_levels' is not supported by the type of var %s." + << this->Name(); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_->type().type()) { + case framework_proto::VarType::LOD_TENSOR: + return desc_->type().lod_tensor().lod_level(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->type().tensor_array().lod_level(); + default: + LOG(FATAL) + << "Getting 'lod_level' is not supported by the type of var %s." + << this->Name(); + } + return 0; +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + res.reserve(desc_->type().reader().lod_tensor_size()); + for (auto &lod_tensor : desc_->type().reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + LOG(FATAL) + << "Getting 'lod_levels' is not supported by the type of var %s." + << this->Name(); + } + return std::vector(); +} + +const framework_proto::VarType::TensorDesc &VarDesc::tensor_desc() const { + CHECK(desc_->has_type()) << "The var's type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + switch (desc_->type().type()) { + case framework_proto::VarType::SELECTED_ROWS: + return desc_->type().selected_rows(); + case framework_proto::VarType::LOD_TENSOR: + return desc_->type().lod_tensor().tensor(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->type().tensor_array().tensor(); + default: + LOG(FATAL) + << "Getting 'tensor_desc' is not supported by the type of var %s." + << this->Name(); + } + return framework_proto::VarDesc().type().lod_tensor().tensor(); +} + +std::vector VarDesc::tensor_descs() + const { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + for (const auto &lod_tensor : desc_->type().reader().lod_tensor()) { + res.push_back(lod_tensor.tensor()); + } + return res; + default: + LOG(FATAL) + << "Getting 'tensor_descs' is not supported by the type of var " + "%s." 
+ << this->Name(); + } + return std::vector(); +} + +framework_proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + switch (desc_->type().type()) { + case framework_proto::VarType::SELECTED_ROWS: + return desc_->mutable_type()->mutable_selected_rows(); + case framework_proto::VarType::LOD_TENSOR: + return desc_->mutable_type()->mutable_lod_tensor()->mutable_tensor(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->mutable_type()->mutable_tensor_array()->mutable_tensor(); + default: + LOG(FATAL) << "Getting 'mutable_tensor_desc' is not supported by the " + "type of var " + "%s." + << this->Name(); + } + return nullptr; +} + +std::vector +VarDesc::mutable_tensor_descs() { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + for (auto &lod_tensor : + *desc_->mutable_type()->mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + LOG(FATAL) + << "Getting 'tensor_descs' is not supported by the type of var " + "%s." + << this->Name(); + } + return std::vector(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h new file mode 100644 index 0000000000000..4cff5fdee0375 --- /dev/null +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -0,0 +1,124 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { +namespace framework_proto = ::paddle::framework::proto; + +// convert between std::vector and protobuf repeated. +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy( + repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (const auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + +// Specialize vector. 
+template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (auto elem : vec) { + *repeated_field->Add() = elem; + } +} + +class VarDesc : public cpp::VarDescAPI { + public: + VarDesc() = delete; + + explicit VarDesc(framework_proto::VarDesc *desc) : desc_(desc) { + CHECK(desc_); + } + + ::paddle::framework::proto::VarDesc *Proto() { return desc_; } + const framework_proto::VarDesc &ReadonlyProto() const { return *desc_; } + + std::string Name() const override { return desc_->name(); } + + void SetName(std::string name) override { desc_->set_name(name); } + + void SetTensorDescNum(size_t num); + + size_t GetTensorDescNum() const; + + void SetShape(const std::vector &dims); + + void SetShapes(const std::vector> &multiple_dims); + + std::vector GetShape() const; + + std::vector> GetShapes() const; + + void SetDataType(VarDescAPI::VarDataType data_type); + + void SetDataTypes( + const std::vector &multiple_data_type); + + VarDescAPI::VarDataType GetDataType() const; + + std::vector GetDataTypes() const; + + void SetLoDLevel(int32_t lod_level); + + void SetLoDLevels(const std::vector &multiple_lod_level); + + int32_t GetLoDLevel() const; + + std::vector GetLoDLevels() const; + + VarDescAPI::Type GetType() const override; + + void SetType(VarDescAPI::Type type) override; + + bool Persistable() const override { return desc_->persistable(); } + + void SetPersistable(bool persistable) override { + desc_->set_persistable(persistable); + } + + private: + const framework_proto::VarType::TensorDesc &tensor_desc() const; + std::vector tensor_descs() const; + framework_proto::VarType::TensorDesc *mutable_tensor_desc(); + std::vector mutable_tensor_descs(); + + framework_proto::VarDesc *desc_; +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/scope.cc b/paddle/infrt/paddle/scope.cc new file mode 100644 index 0000000000000..d7bab9f749591 --- /dev/null +++ b/paddle/infrt/paddle/scope.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
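Editor's note: a sketch (not part of the patch) of driving the pb::VarDesc wrapper declared above; the parameter name is hypothetical, and the VarDescAPI enum spellings are assumed from cpp/desc_api.h, which is outside this hunk.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/var_desc.h"

void DescribeWeight(::paddle::framework::proto::VarDesc* raw_var) {
  raw_var->set_name("fc_0.w_0");  // hypothetical parameter name
  infrt::paddle::pb::VarDesc var(raw_var);

  // Set the type first: SetShape()/SetDataType() route through
  // mutable_tensor_desc(), which dispatches on the variable type.
  var.SetType(infrt::paddle::cpp::VarDescAPI::Type::LOD_TENSOR);
  var.SetShape({256, 128});
  var.SetDataType(infrt::paddle::cpp::VarDescAPI::Type::FP32);
  var.SetPersistable(true);

  std::vector<int64_t> dims = var.GetShape();  // {256, 128}
  (void)dims;
}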
+ +#include "paddle/infrt/paddle/scope.h" + +#include "paddle/infrt/common/common.h" + +namespace infrt { +namespace paddle { + +_Variable* Scope::FindVar(const std::string& name) const { + auto it = data_.find(name); + if (it != data_.end()) return it->second.get(); + return nullptr; +} + +Tensor Scope::GetTensor(const std::string& name) const { + CheckVarNameValid(name); + auto* var = FindVar(name); + CHECK(var) << "No variable called [" << name << "] found"; + return var->get(); +} + +std::vector Scope::var_names() const { + std::vector names; + for (auto& item : data_) { + names.push_back(item.first); + } + return names; +} + +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/scope.h b/paddle/infrt/paddle/scope.h new file mode 100644 index 0000000000000..4ebf846374c6f --- /dev/null +++ b/paddle/infrt/paddle/scope.h @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/paddle/tensor.h" +#include "paddle/infrt/support/variant.h" + +namespace infrt { +namespace paddle { + +using _Variable = Variant; + +struct _Tensor_; + +class Scope { + public: + static std::shared_ptr Create() { return std::make_shared(); } + + //! Get or create a variable. + template + _Variable* Var(const std::string& name); + + //! Find a variable, get null if not exists. + _Variable* FindVar(const std::string& name) const; + + Tensor GetTensor(const std::string& name) const; + + //! Get variable names. + std::vector var_names() const; + + Scope() = default; + + private: + std::unordered_map> data_; + + INFRT_DISALLOW_COPY_AND_ASSIGN(Scope); +}; + +template +_Variable* Scope::Var(const std::string& name) { + VLOG(4) << "Scope insert Var [" << name << "]"; + _Variable* x = FindVar(name); + if (x) return x; + auto* data = new _Variable(T()); + data_[name].reset(data); + return data; +} + +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/tensor.cc b/paddle/infrt/paddle/tensor.cc new file mode 100644 index 0000000000000..072701ee9077d --- /dev/null +++ b/paddle/infrt/paddle/tensor.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/paddle/tensor.h" + +namespace infrt { +namespace paddle {} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/tensor.h b/paddle/infrt/paddle/tensor.h new file mode 100644 index 0000000000000..5c4458bb62d73 --- /dev/null +++ b/paddle/infrt/paddle/tensor.h @@ -0,0 +1,107 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/infrt/common/buffer.h" +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/common/object.h" + +namespace infrt { +namespace paddle { +using common::Target; + +struct Shape { + using dim_t = int; + + Shape() = default; + explicit Shape(const std::vector& data) : data_(data) {} + + void SetData(const std::vector& data) { data_ = data; } + + const std::vector& data() const INFRT_RESULT_SHOULD_USE { + return data_; + } + std::vector& data() INFRT_RESULT_SHOULD_USE { return data_; } + size_t size() const INFRT_RESULT_SHOULD_USE { return data_.size(); } + uint32_t numel() const INFRT_RESULT_SHOULD_USE { + return std::accumulate( + data_.begin(), data_.end(), 1, [](dim_t a, dim_t b) { return a * b; }); + } + + private: + std::vector data_; +}; + +class _Tensor_ : public common::Object { + public: + _Tensor_() : buffer_(std::make_shared()) {} + + Shape& shape() { return shape_; } + + void Resize(const Shape& shape) { + shape_ = shape; + buffer_->data()->resize( + reinterpret_cast(shape.data().data()), + shape.size()); + } + + template + inline T* mutable_data(const Target& target) { + set_type(type_of()); + if (target == common::DefaultHostTarget()) { + int alignment = type_of().ElementOf().bits(); + buffer_->ResizeLazy(alignment, shape_.numel() * sizeof(T), target); + } else { + buffer_->ResizeLazy(shape_.numel() * sizeof(T), target); + } + return reinterpret_cast(buffer_->data()->memory); + } + + template + const T* data() const { + return reinterpret_cast(buffer_->data()->memory); + } + + const Type& type() { return type_; } + + void set_type(Type type) { type_ = type; } + const Type& type() const { return type_; } + + infrt_buffer_t* buffer() { return buffer_->data(); } + + const char* type_info() const override { return __type_info__; } + + private: + common::Type type_; + // A shared ptr to make it easier to share buffer between tensors. 
+ std::shared_ptr buffer_; + Shape shape_; + + static constexpr const char* __type_info__ = "_frontend_tensor_"; +}; + +class Tensor : public Shared<_Tensor_> { + public: + Tensor() : Shared(new _Tensor_) {} + explicit Tensor(_Tensor_* x) : Shared(x) {} +}; + +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/support/CMakeLists.txt b/paddle/infrt/support/CMakeLists.txt new file mode 100644 index 0000000000000..9bcce6cab368d --- /dev/null +++ b/paddle/infrt/support/CMakeLists.txt @@ -0,0 +1 @@ +core_gather_headers() diff --git a/paddle/infrt/support/type_traits.h b/paddle/infrt/support/type_traits.h new file mode 100644 index 0000000000000..341dabb7c1c4a --- /dev/null +++ b/paddle/infrt/support/type_traits.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file defines type traits related utilities. + +#pragma once + +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" + +namespace infrt { + +// Utility template for tag dispatching. +template +struct TypeTag {}; + +// This is the equivalent of std::void_t in C++17. +template +struct make_void { + typedef void type; +}; +template +using void_t = typename make_void::type; + +// The same as std::disjunction in C++17. +template +struct disjunction : std::false_type {}; +template +struct disjunction : B1 {}; +template +struct disjunction + : std::conditional_t> {}; + +// Check whether T may be a base class. +template +using MaybeBase = + llvm::conjunction, llvm::negation>>; + +// Find the index of a type in a tuple. +// +// Example: +// using Tuple = std::tuple; +// static_assert(TupleIndexOf::value == 0); +// static_assert(TupleIndexOf::value == 2); +template +struct TupleIndexOf; + +template +struct TupleIndexOf> + : std::integral_constant {}; + +template +struct TupleIndexOf> + : std::integral_constant>::value> { +}; + +template +struct TupleHasType; + +template +struct TupleHasType> + : disjunction...> {}; + +// The detector pattern in C++ that can be used for checking whether a type has +// a specific property, e.g. whether an internal type is present or whether a +// particular operation is valid. +// +// Sample usage: +// +// struct Foo { +// using difference_type = int; +// int get(); +// }; +// struct Bar {}; +// +// // Check whether a type T has an internal difference_type. +// template +// using diff_t = typename T::difference_type; +// +// static_assert(is_detected_v, "Foo has difference_type"); +// static_assert(!is_detected_v, "Bar has no difference_type"); +// +// // Check whether a type T has a get() member function. +// template +// using has_get_t = decltype(std::declval().get()); +// +// static_assert(is_detected_v, "Foo has get()"); +// static_assert(!is_detected_v, "Bar has no get()"); +// +// See https://en.cppreference.com/w/cpp/experimental/is_detected for details. + +namespace internal { + +// nonesuch is a class type used to indicate detection failure. 
+struct nonesuch {
+  ~nonesuch() = delete;
+  nonesuch(nonesuch const&) = delete;
+  void operator=(nonesuch const&) = delete;
+};
+
+template <class Default,
+          class AlwaysVoid,
+          template <class...> class Op,
+          class... Args>
+struct detector : std::false_type {
+  using value_t = std::false_type;
+  using type = Default;
+};
+
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
+  using value_t = std::true_type;
+  using type = Op<Args...>;
+};
+
+}  // namespace internal
+
+template