From a821c4a941d1fdf698bf6ac632c90a02c103502b Mon Sep 17 00:00:00 2001
From: Wilber
Date: Sun, 6 Feb 2022 11:05:25 +0800
Subject: [PATCH] [PTEN] Add Gpu context (#39305)

---
paddle/fluid/distributed/common/utils.h | 2 +-
.../ps/service/communicator/communicator.cc | 8 +-
.../ps/service/communicator/communicator.h | 4 +-
.../framework/data_type_transform_test.cu | 5 +-
paddle/fluid/framework/parallel_executor.cc | 2 +-
paddle/fluid/framework/pten_utils.h | 7 +
paddle/fluid/framework/tensor_util.cc | 6 +-
paddle/fluid/framework/tensor_util_test.cc | 20 +
paddle/fluid/imperative/gloo_context.cc | 11 +
.../fluid/inference/lite/test_engine_lite.cc | 4 +
.../fluid/inference/tensorrt/test_engine.cc | 12 +
.../allocation/best_fit_allocator_test.cu | 5 +
paddle/fluid/memory/malloc_test.cu | 35 +-
.../fluid/operators/arg_min_max_op_base.cu.h | 2 +-
paddle/fluid/operators/argsort_op.cu | 4 +-
.../fluid/operators/broadcast_tensors_op.cu | 4 +-
paddle/fluid/operators/cholesky_solve_op.cu | 3 +-
paddle/fluid/operators/clip_by_norm_op.cu | 2 +-
.../operators/controlflow/compare_all_op.cu | 3 +-
.../fluid/operators/copy_cross_scope_test.cc | 8 +
.../elementwise/elementwise_mul_op.cu | 5 +-
.../elementwise/elementwise_op_function.h | 3 +-
paddle/fluid/operators/feed_forward_test.cu | 12 +
paddle/fluid/operators/fused/attn_gemm.h | 3 +-
paddle/fluid/operators/gelu_op.cu | 4 +-
.../fluid/operators/graph_khop_sampler_op.cu | 4 +-
paddle/fluid/operators/graph_send_recv_op.cu | 4 +-
paddle/fluid/operators/gumbel_softmax_op.cu | 2 +-
paddle/fluid/operators/index_sample_op.cu | 6 +-
paddle/fluid/operators/interpolate_v2_op.cu | 8 +-
paddle/fluid/operators/kron_op.h | 4 +-
paddle/fluid/operators/kthvalue_op.cu | 2 +-
.../operators/lite/lite_engine_op_test.cc | 4 +
.../operators/margin_cross_entropy_op.cu | 4 +-
.../fluid/operators/math/beam_search_test.cc | 79 ++
paddle/fluid/operators/math/blas_impl.cu.h | 577 +++++++++++
paddle/fluid/operators/math/blas_impl.hip.h | 406 ++++++++
paddle/fluid/operators/math/concat_test.cc | 27 +
.../math/cusparse_conversion_api_test.cc | 10 +
paddle/fluid/operators/math/im2col_test.cc | 162 ++++
paddle/fluid/operators/math/inclusive_scan.h | 2 +-
paddle/fluid/operators/math/math_function.cu | 13 +
.../operators/math/math_function_test.cc | 1 +
.../operators/math/math_function_test.cu | 36 +
paddle/fluid/operators/math/vol2col_test.cc | 121 ++-
paddle/fluid/operators/mean_op.cu | 3 +-
.../fluid/operators/nccl/nccl_op_test.cu.cc | 7 +-
paddle/fluid/operators/p_norm_op.cu | 12 +-
paddle/fluid/operators/pool_op.h | 4 +-
paddle/fluid/operators/prelu_op.cu | 3 +-
.../pscore/send_and_recv_op_gpu_test.cc | 4 +
.../fluid/operators/reduce_ops/reduce_op.cu.h | 6 +-
paddle/fluid/operators/renorm_op.cu | 10 +-
paddle/fluid/operators/reshape_op.cc | 9 +-
paddle/fluid/operators/scatter.cu.h | 3 +-
paddle/fluid/operators/solve_op.h | 3 +-
paddle/fluid/operators/strided_memcpy_test.cc | 9 +
.../tensorrt/tensorrt_engine_op_test.cc | 12 +
paddle/fluid/operators/top_k_function_cuda.h | 2 +-
paddle/fluid/operators/trace_op.cu | 3 +-
paddle/fluid/operators/triangular_solve_op.cu | 3 +-
paddle/fluid/operators/viterbi_decode_op.cu | 2 +-
paddle/fluid/platform/CMakeLists.txt | 1 +
paddle/fluid/platform/bfloat16_test.cu | 4 +
paddle/fluid/platform/collective_helper.cc | 15 +-
.../fluid/platform/device/gpu/CMakeLists.txt | 4 +-
.../platform/device/gpu/cuda/CMakeLists.txt | 1 -
.../platform/device/gpu/cuda/cuda_helper.h | 5 +-
.../device/gpu/cuda/cusparse_helper.h | 6 +-
paddle/fluid/platform/device/gpu/gpu_info.cc | 107 ++-
paddle/fluid/platform/device/gpu/gpu_info.h | 5 +-
.../fluid/platform/device/gpu/nccl_helper.h | 17 +-
.../platform/device/gpu/rocm/CMakeLists.txt | 2 -
paddle/fluid/platform/device_context.cc | 229 ++---
paddle/fluid/platform/device_context.h | 156 ++-
paddle/fluid/platform/device_context_test.cu | 28 +
paddle/fluid/platform/device_event_gpu.cc | 5 +-
paddle/fluid/platform/device_event_test.cc | 5 +-
paddle/fluid/platform/enforce.h | 4 +-
paddle/fluid/platform/event.h | 6 +-
paddle/fluid/platform/float16_test.cu | 4 +
paddle/fluid/platform/for_range.h | 36 +
paddle/fluid/platform/stream/cuda_stream.cc | 16 +-
paddle/fluid/platform/stream/cuda_stream.h | 16 +-
paddle/fluid/platform/transform_test.cu | 9 +
paddle/fluid/pybind/cuda_streams_py.cc | 8 +-
paddle/fluid/pybind/pybind.cc | 46 +-
paddle/pten/backends/CMakeLists.txt | 8 +
paddle/pten/backends/cpu/cpu_context.cc | 68 +-
paddle/pten/backends/cpu/cpu_context.h | 34 +-
paddle/pten/backends/gpu/CMakeLists.txt | 9 +
paddle/pten/backends/gpu/cuda/CMakeLists.txt | 1 +
paddle/pten/backends/gpu/cuda/cuda_helper.h | 72 ++
.../backends}/gpu/cuda/cuda_info.cc | 131 ++-
paddle/pten/backends/gpu/forwards.h | 111 +++
paddle/pten/backends/gpu/gpu_context.cc | 899 ++++++++++++++++++
paddle/pten/backends/gpu/gpu_context.h | 165 +++-
paddle/pten/backends/gpu/gpu_decls.h | 70 ++
paddle/pten/backends/gpu/gpu_helper.h | 26 +
paddle/pten/backends/gpu/gpu_info.cc | 60 ++
paddle/pten/backends/gpu/gpu_info.h | 132 +++
paddle/pten/backends/gpu/gpu_launch_config.h | 181 ++++
paddle/pten/backends/gpu/gpu_types.h | 73 ++
paddle/pten/backends/gpu/rocm/CMakeLists.txt | 1 +
paddle/pten/backends/gpu/rocm/rocm_helper.h | 72 ++
.../backends}/gpu/rocm/rocm_info.cc | 128 ++-
paddle/pten/backends/xpu/xpu_context.cc | 102 +-
paddle/pten/backends/xpu/xpu_context.h | 32 +-
paddle/pten/core/device_context.cc | 14 +-
paddle/pten/core/device_context.h | 6 +-
.../pten/kernels/funcs/cuda_kernel_config.h | 3 +-
paddle/pten/kernels/funcs/elementwise_base.h | 5 +-
paddle/pten/kernels/funcs/transpose.cu | 1 +
paddle/pten/kernels/gpu/concat_and_split.h | 11 +-
paddle/pten/kernels/gpu/copy_kernel.cu | 20 +-
paddle/pten/kernels/gpu/elementwise.h | 8 +-
paddle/pten/kernels/gpu/reduce.h | 21 +-
.../kernels/impl/matmul_grad_kernel_impl.h | 8 +-
paddle/pten/tests/api/test_matmul_api.cc | 3 +-
.../pten/tests/api/test_sparse_utils_api.cc | 1 +
paddle/pten/tests/core/test_device_context.cc | 32 +-
.../pten/tests/kernels/test_cast_dev_api.cc | 8 +-
.../pten/tests/kernels/test_concat_dev_api.cc | 8 +-
.../pten/tests/kernels/test_conj_dev_api.cc | 8 +-
.../pten/tests/kernels/test_copy_dev_api.cc | 8 +-
.../tests/kernels/test_creation_dev_api.cc | 32 +-
paddle/pten/tests/kernels/test_dot_dev_api.cc | 8 +-
.../tests/kernels/test_elementwise_dev_api.cc | 32 +-
.../tests/kernels/test_flatten_dev_api.cc | 8 +-
.../pten/tests/kernels/test_matmul_dev_api.cc | 8 +-
.../pten/tests/kernels/test_mean_dev_api.cc | 8 +-
.../tests/kernels/test_reshape_dev_api.cc | 8 +-
.../pten/tests/kernels/test_scale_dev_api.cc | 18 +-
.../kernels/test_sparse_utils_dev_api.cc | 61 +-
paddle/pten/tests/kernels/test_sum_dev_api.cc | 9 +-
135 files changed, 4438 insertions(+), 770 deletions(-)
create mode 100644 paddle/pten/backends/gpu/CMakeLists.txt
create mode 100644 paddle/pten/backends/gpu/cuda/CMakeLists.txt
create mode 100644 paddle/pten/backends/gpu/cuda/cuda_helper.h
rename paddle/{fluid/platform/device => pten/backends}/gpu/cuda/cuda_info.cc (71%)
create mode
100644 paddle/pten/backends/gpu/forwards.h create mode 100644 paddle/pten/backends/gpu/gpu_context.cc create mode 100644 paddle/pten/backends/gpu/gpu_decls.h create mode 100644 paddle/pten/backends/gpu/gpu_helper.h create mode 100644 paddle/pten/backends/gpu/gpu_info.cc create mode 100644 paddle/pten/backends/gpu/gpu_info.h create mode 100644 paddle/pten/backends/gpu/gpu_launch_config.h create mode 100644 paddle/pten/backends/gpu/gpu_types.h create mode 100644 paddle/pten/backends/gpu/rocm/CMakeLists.txt create mode 100644 paddle/pten/backends/gpu/rocm/rocm_helper.h rename paddle/{fluid/platform/device => pten/backends}/gpu/rocm/rocm_info.cc (72%) diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index fb2189b8f5a1b..85b89d75b98b6 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -33,7 +33,7 @@ namespace distributed { template inline paddle::operators::math::BlasT GetBlas() { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; return paddle::operators::math::GetBlas(cpu_ctx); } diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index 3f1667e5344d6..99973ee8bdd74 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) { auto &t_latest = var_latest->Get(); auto t_timestamp = var_timestamp->GetMutable(); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); @@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { RpcRecvDense(varnames, table_id, pserver_scope_.get()); // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; for (auto &varname : varnames) { auto *var_latest = recv_scope_->FindVar(varname); auto t_latest = var_latest->GetMutable(); @@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto *t_old = var_old->GetMutable(); auto dims1 = t_latest.dims()[1]; - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); @@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id, std::vector v_delta; v_delta.resize(numel); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto blas = paddle::operators::math::GetBlas( cpu_ctx); diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index c63f341607439..da4e2f1a12898 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name, } // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; paddle::operators::math::SetConstant constant_functor; constant_functor(cpu_ctx, out_t, static_cast(0)); @@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name, for (auto &var : vars) { inputs.push_back(&var->Get()); } - auto dev_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext dev_ctx; if (merge_add) { paddle::operators::math::scatter::MergeAdd< paddle::platform::CPUDeviceContext, T> diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 0874509a8797c..6b84fdf0ec9a6 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(gpu_place); - + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); auto kernel_fp16 = paddle::framework::OpKernelType( paddle::framework::proto::VarType::FP16, gpu_place, paddle::framework::DataLayout::kAnyLayout, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8767b55062cda..7eebfb904cf66 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { auto *dev_ctx = static_cast( pool.Get(member_->places_[dev_id])); auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + dev_ctx->SetBkclContext(bkcl_ctx.comm()); } #else PADDLE_THROW( diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index 2d335fc9c9894..ae0388079d280 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -77,6 +77,13 @@ struct ConvertToPtenContext { using TYPE = pten::CPUContext; }; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +struct ConvertToPtenContext { + using TYPE = pten::GPUContext; +}; +#endif + #ifdef PADDLE_WITH_XPU template <> struct ConvertToPtenContext { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dff4879096056..b926a3cc7659b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.seekg(seekg, is.cur); void* buf; - auto ctx = platform::CPUDeviceContext(); + platform::CPUDeviceContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(framework::make_ddim(dims)); void* buf; - auto ctx = platform::CPUDeviceContext(); + platform::CPUDeviceContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& 
t) { VLOG(1) << "PrintVar: unrecognized data type:" << t.type(); return os; } -} +} // namespace pten diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 8587ee8d1e919..fbbf29254d0f9 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); // GPU Tensor to CPU Tensor @@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) { gpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); // Copy from GPU to CPU tensor for comparison paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); @@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) { paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; @@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) { paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; @@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) { auto gpu_place = new platform::CUDAPlace(); platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 44315e267ee78..eeac500cac413 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -46,6 +46,17 @@ void GLOOParallelContext::Init() { gloo_wrapper->Init(); device_ = std::unique_ptr( new platform::CPUDeviceContext(platform::CPUPlace())); + device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + device_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CPUPlace()) + .get()); } void GLOOParallelContext::InitWithRingID(int ring_id) { diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc 
b/paddle/fluid/inference/lite/test_engine_lite.cc index b2750fd070d3e..85f7d3ee363a7 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; platform::CPUDeviceContext ctx(place); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index d5cc69ea661d9..fa03d5320f2e7 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->PartialInitWithAllocator(); engine_ = new TensorRTEngine(10, 1 << 10); engine_->InitNetwork(); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 59c14103ca67d..cad31f5bba91b 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -18,6 +18,7 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h" @@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) { platform::CUDAPlace gpu(0); platform::CUDADeviceContext dev_ctx(gpu); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu, dev_ctx.stream()) + .get()); + dev_ctx.PartialInitWithAllocator(); auto th_main = [&](std::random_device::result_type seed) { std::default_random_engine engine(seed); diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index d015ed7ce693f..07577531d6459 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -25,6 +25,7 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" @@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - dev_ctx.push_back(std::unique_ptr( - new platform::CUDADeviceContext(place))); + auto ctx = std::unique_ptr( + new platform::CUDADeviceContext(place)); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); 
+ ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctx.emplace_back(std::move(ctx)); MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]); } @@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - dev_ctx.push_back(std::unique_ptr( - new platform::CUDADeviceContext(place))); + auto ctx = std::unique_ptr( + new platform::CUDADeviceContext(place)); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctx.emplace_back(std::move(ctx)); threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i], std::cref(*dev_ctx[i]))); } diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h index 2c34d6f8300a7..89ba2dfb9250b 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h @@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, return block_size; }; - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t height = pre * post; int64_t width = n; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index 6236a07de4bc6..b6a1f1f6d2358 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, int block_size = ComputeBlockSize(num_cols); - int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; // Init a index array @@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, int block_size = ComputeBlockSize(num_cols); - int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; FillGrad<<>>( diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu index 718e7ce396621..5a835c7678fa2 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ b/paddle/fluid/operators/broadcast_tensors_op.cu @@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { // reduce_sum implementation on CUDA auto stream = context.cuda_device_context().stream(); TensorReduceFunctorImpl>( - *input_tensor, output_tensor, kps::IdentityFunctor(), - reduce_dims_vec, stream); + context.cuda_device_context(), *input_tensor, output_tensor, + kps::IdentityFunctor(), reduce_dims_vec, stream); } } } diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu index f42364c9619ef..a5d5baf19dad0 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ b/paddle/fluid/operators/cholesky_solve_op.cu @@ -115,7 +115,8 @@ class MatrixReduceSumFunctor { } gpuStream_t stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), + out_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu index 4d04fdc8ce2d2..f2714d1378510 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -77,7 +77,7 @@ class ClipByNormKernel {1}, dev_ctx); TensorReduceFunctorImpl>( - *input, &tmp, kps::SquareFunctor(), + dev_ctx, *input, &tmp, kps::SquareFunctor(), reduce_dims, dev_ctx.stream()); auto tmp_eigen = EigenVector::Flatten(tmp); auto x_norm = tmp_eigen.sqrt(); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 54f59c40a205d..6526d774cafe8 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -65,7 +65,8 @@ class CompareReduceOpKernel auto stream = context.cuda_device_context().stream(); TensorReduceFunctorImpl>( - tmp, z, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), + reduce_dims, stream); } } }; diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index 5f951ad337e8e..95e30efda0f78 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, TEST(copy_cross_scope, CUDA_fp32) { f::Scope scope; p::CUDADeviceContext ctx(p::CUDAPlace(0)); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p::CUDAPlace(0), ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); Compare1(&scope, ctx, "copy_cross_scope"); } TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { f::Scope scope; p::CUDADeviceContext ctx(p::CUDAPlace(0)); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p::CUDAPlace(0), ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); Compare2(&scope, ctx, "copy_cross_scope"); } #elif PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 8923f1fd4b866..56580e6d59525 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ 
b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/pten/backends/gpu/gpu_context.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -51,8 +52,8 @@ class ElementwiseMulKernel auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::MultiplyRawKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, - pt_z.get()); + pten::MultiplyRawKernel(static_cast(cuda_ctx), + *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index fdf04181de76c..74e74870b8e11 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, framework::Tensor *src, framework::Tensor *dst) { std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); TensorReduceFunctorImpl>( - *src, dst, kps::IdentityFunctor(), reduce_dims, dev_ctx.stream()); + dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, + dev_ctx.stream()); } template diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 9f44c39a92c5e..dea427393b175 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -275,6 +275,18 @@ class TestFeedForward { output_size_ = 3 * num_head_ * dim_head_; input_size_ = dim_embed_; ctx_ = new platform::CUDADeviceContext(place_); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_, ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place_) + .get()); + ctx_->PartialInitWithAllocator(); size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim] size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim] diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index b9b881cf83e0b..1128997fd259a 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -166,7 +166,8 @@ class AttnMatMul { if (support_case_1 || support_case_2) { gpuStream_t stream = dev_ctx_.stream(); TensorReduceFunctorImpl>( - *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, stream); + dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only support reduce when the input dims are [0,1,2,3,4] and " diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 73694471799af..6b778eee43451 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( is_aligned(y, kAlignment)) { \ size_t 
thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize().x); \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ << " , thread = " << thread; \ FP16FastGeluFwdCUDAKernel< \ @@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( is_aligned(x_g, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize().x); \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ << " , thread = " << thread; \ FP16FastGeluBwdCUDAKernel< \ diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 777ec64f6e008..df977b43512a0 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, int block = 1024; #endif const auto& dev_ctx = ctx.cuda_device_context(); - int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; // 1. Insert data into keys and values. @@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, int block = 1024; #endif const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (outputs->size() + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; ReindexSrcOutput< diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu index 6e5e203e2d943..446ad2d97a7fb 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper( #endif int64_t n = slice_size * index_size; const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; @@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( #endif int64_t n = slice_size * index_size; const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 4e91e689fa58c..63577ed1e0f1f 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -92,7 +92,7 @@ struct OneHotGenerator { const int size_from_axis = SizeFromAxis(axis, X.dims()); const int size_out_axis = SizeOutAxis(axis, X.dims()); constexpr int thread_size = 512; - int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0]; int64_t height = size_to_axis * size_out_axis; int block_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 45f63c2b2fbd8..4c9dec1400076 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -27,10 +27,10 @@ namespace operators { namespace { void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - dim3 max_grid_dim = ctx.template device_context() + auto max_grid_dim = ctx.template device_context() .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x; - grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y; + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; } } diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index a4d5e75e7289b..3901a2515450d 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( int block_y = std::min(GetLastPow2(height), max_threads / block_x); int block_z = std::min(num_img, max_threads / block_x / block_y); - dim3 max_grid_dim = context.GetCUDAMaxGridDimSize(); - int grid_x = std::min(max_grid_dim.x, platform::DivUp(width, block_x)); - int grid_y = std::min(max_grid_dim.y, platform::DivUp(height, block_y)); + auto max_grid_dim = context.GetCUDAMaxGridDimSize(); + int grid_x = std::min(max_grid_dim[0], platform::DivUp(width, block_x)); + int grid_y = std::min(max_grid_dim[1], platform::DivUp(height, block_y)); int grid_z = - std::min(max_grid_dim.z, platform::DivUp(num_img, block_z * 4)); + std::min(max_grid_dim[2], platform::DivUp(num_img, block_z * 4)); const int capability = context.GetComputeCapability(); platform::GpuLaunchConfig config; diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h index 6039d8c624052..ff3baf4d70f10 100644 --- a/paddle/fluid/operators/kron_op.h +++ b/paddle/fluid/operators/kron_op.h @@ -306,11 +306,11 @@ struct KronGradOpFunctor { auto stream = dev_ctx.stream(); // it is a cuda device_context if (dx) { TensorReduceFunctorImpl>( - dout_x, dx, kps::IdentityFunctor(), {1}, stream); + dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); } if (dy) { TensorReduceFunctorImpl>( - dout_y, dy, kps::IdentityFunctor(), {1}, stream); + dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); } #else auto* place = dev_ctx.eigen_device(); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index c6c62a763aa06..0941f9d4c3b3b 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -54,7 +54,7 @@ bool 
SortKthvalue(const platform::CUDADeviceContext& ctx, input_indices.mutable_data(ctx.GetPlace()); size_t temp_storage_bytes = -1; int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; unsigned int grid_size = num_rows < maxGridDimX ? static_cast(num_rows) : maxGridDimX; diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 053ba322d8f4d..01583cea31222 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; platform::CPUDeviceContext ctx(place); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index e4fb4150f841b..a8a3390c00207 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* logits_max_buff = logits_max.mutable_data(place); TensorReduceFunctorImpl>( - softmax_2d, &logits_max, kps::IdentityFunctor(), {1}, + dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor(), {1}, dev_ctx.stream()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); TensorReduceFunctorImpl>( - softmax_2d, &sum_exp_logits, kps::ExpFunctor(), {1}, + dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor(), {1}, dev_ctx.stream()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index ec2e9516fcd4b..e6f6a09a43b20 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* scores, @@ -129,6 +131,83 @@ void TestBeamSearch() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + paddle::framework::TensorCopySync(cpu_ids, *place, &ids); + paddle::framework::TensorCopySync(cpu_scores, *place, &scores); + paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids); + paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + paddle::framework::LoDTensor parent_idx; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor< + paddle::platform::CUDADeviceContext, float> + beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, &parent_idx, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + paddle::framework::TensorCopySync( + selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); + paddle::framework::TensorCopySync( + selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} +#endif + TEST(BeamSearch, CPU) { TestBeamSearch(); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 92162e639ff86..7ffd2a7ab2d84 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/pten/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -92,6 +93,32 @@ 
struct CUBlas { #endif } + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const float *beta, void *C, cudaDataType_t Ctype, + int ldc) { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasSgemmEx is not supported on cuda <= 7.5")); +#endif + } + template static void TRSM(ARGS... args) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...)); @@ -273,6 +300,37 @@ struct CUBlas { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } }; template <> @@ -388,6 +446,37 @@ struct CUBlas> { #endif } + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } + static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t transa, cublasDiagType_t diag, int m, int n, @@ -529,6 +618,37 @@ struct CUBlas> { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } }; template <> @@ -564,6 +684,39 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -611,6 +764,55 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -659,6 +861,56 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A, + CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -708,6 +960,57 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. 
+ auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A, + CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template void Blas::GEMM(bool transA, bool transB, int M, @@ -738,6 +1041,35 @@ void Blas::GEMM(bool transA, bool transB, int M, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(bool transA, bool transB, int M, int N, int K, + T alpha, const T *A, int lda, const T *B, + int ldb, T beta, T *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -755,6 +1087,25 @@ inline void Blas::GEMM( }); } +template <> +template <> +inline void Blas::GEMM(bool transA, bool transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, int lda, + const platform::float16 *B, int ldb, + platform::float16 beta, + platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, @@ -764,6 +1115,14 @@ void Blas::AXPY(int n, T alpha, const T *x, }); } +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} + template <> template void Blas::SCAL(int n, const T alpha, T *x) const { @@ -771,6 +1130,13 @@ void Blas::SCAL(int n, const T alpha, T *x) const { [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} + template <> template void Blas::VCOPY(int n, const T *x, T *y) const { @@ -778,6 +1144,13 @@ void Blas::VCOPY(int n, const T *x, T *y) const { [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, @@ -790,6 +1163,17 @@ void Blas::GEMV(bool trans_a, int M, int N, }); } +template <> +template +void Blas::GEMV(bool trans_a, int M, int N, T alpha, + const T *A, const T *B, T beta, T *C) const { + cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} + template <> template <> inline void Blas::GEMV( @@ -806,6 +1190,24 @@ inline void Blas::GEMV( } } +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -854,6 +1256,56 @@ void Blas::BatchedGEMM( #endif // CUDA_VERSION >= 9010 } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T *A, const T *B, + T beta, T *C, int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 9010 + if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || + std::is_same::value) { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); + + auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, + fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); + }); + } else { +#endif // CUDA_VERSION >= 9010 + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 +} + template <> template void Blas::BatchedGEMM( @@ -865,6 +1317,19 @@ void Blas::BatchedGEMM( } } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T **A, + const T **B, T beta, T **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} + template <> template <> inline void Blas::BatchedGEMM( @@ -878,6 +1343,19 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, @@ -903,6 +1381,30 @@ void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, }); } +template <> +template +void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, CBLAS_DIAG diag, + int M, int N, T alpha, const T *A, int lda, + T *B, int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + cublasSideMode_t cuSide = + (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + cublasFillMode_t cuUplo = + (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasDiagType_t cuDiag = + (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, + lda, B, ldb); + }); +} + template <> template void Blas::BatchedGETRF(int n, T **a, int *ipiv, @@ -913,6 +1415,15 @@ void Blas::BatchedGETRF(int n, T **a, int *ipiv, }); } +template <> +template +void Blas::BatchedGETRF(int n, T **a, int *ipiv, int *info, + int batch_size) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} + template <> template void Blas::BatchedGETRI(int n, const T **a, @@ -931,6 +1442,23 @@ void Blas::BatchedGETRI(int n, const T **a, }); } +template <> +template +void Blas::BatchedGETRI(int n, const T **a, const int *ipiv, + T **a_inv, int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, a, + platform::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. 
The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, a)); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} + template <> template void Blas::BatchedMatInv(int n, const T **a, @@ -941,6 +1469,15 @@ void Blas::BatchedMatInv(int n, const T **a, }); } +template <> +template +void Blas::BatchedMatInv(int n, const T **a, T **a_inv, + int *info, int batch_size) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} + template <> template void Blas::BatchedGETRS( @@ -955,6 +1492,21 @@ void Blas::BatchedGETRS( }); } +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, int n, + int nrhs, const T **a, int lda, + int *ipiv, T **b, int ldb, int *info, + int batch_size) const { + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTrans = + (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} + template <> template void Blas::BatchedTRSM( @@ -979,6 +1531,31 @@ void Blas::BatchedTRSM( }); } +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, int M, int N, T alpha, + const T **A, int lda, T **B, int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + cublasSideMode_t cuSide = + (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + cublasFillMode_t cuUplo = + (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasDiagType_t cuDiag = + (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, + &alpha, A, lda, B, ldb, batch_size); + }); +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 32479189eea58..bf7d66f485327 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -17,6 +17,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -221,6 +222,20 @@ struct CUBlas { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -305,6 +320,20 @@ struct CUBlas> { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -389,6 +418,20 @@ struct CUBlas> { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -412,6 +455,27 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, &beta, C, N); }); } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different 
from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, + &beta, C, N); + }); +} template <> template <> @@ -448,6 +512,43 @@ inline void Blas::GEMM( rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, + rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); +} template <> template <> @@ -485,6 +586,44 @@ inline void Blas::GEMM( rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, + rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, + rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); +} template <> template <> @@ -523,6 +662,45 @@ inline void Blas::GEMM( rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, + rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, + rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); +} template <> template @@ -541,6 +719,22 @@ void Blas::GEMM(bool transA, bool transB, int M, &beta, C, ldc); }); } +template <> +template +void Blas::GEMM(bool transA, bool transB, int M, int N, int K, + T alpha, const T *A, int lda, const T *B, + int ldb, T beta, T *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = + transA ? rocblas_operation_transpose : rocblas_operation_none; + rocblas_operation cuTransB = + transB ? rocblas_operation_transpose : rocblas_operation_none; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, + &beta, C, ldc); + }); +} template <> template <> @@ -560,6 +754,26 @@ inline void Blas::GEMM( B, ldb, A, lda, &beta, C, ldc); }); } +template <> +template <> +inline void Blas::GEMM(bool transA, bool transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, int lda, + const platform::float16 *B, int ldb, + platform::float16 beta, + platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. 
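The "fortran order" comments refer to the standard row-major trick: a column-major BLAS computes a row-major C = A * B through the identity C^T = B^T * A^T, which is why every wrapper here passes (cuTransB, cuTransA, N, M, K, ..., B, ldb, A, lda, ..., C, N). A minimal sketch of the no-transpose case, shown with cuBLAS for concreteness (rocblas_sgemm takes its parameters in the same order); the helper name is an assumption, not from the patch.

// Sketch: row-major C[M x N] = A[M x K] * B[K x N] via a column-major BLAS.
// A row-major buffer reinterpreted as column-major is its own transpose, so
// swapping the operands and the M/N extents yields C^T in column-major
// storage, which is exactly C in row-major storage.
#include <cublas_v2.h>

void RowMajorSgemm(cublasHandle_t handle, int M, int N, int K,
                   const float* A,  // M x K, row-major
                   const float* B,  // K x N, row-major
                   float* C) {      // M x N, row-major
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              /*m=*/N, /*n=*/M, /*k=*/K, &alpha,
              /*A=*/B, /*lda=*/N,
              /*B=*/A, /*ldb=*/K, &beta,
              /*C=*/C, /*ldc=*/N);
}

With transposes requested, the wrappers simply flip the operation flags and pick lda/ldb accordingly, as in the code above and below.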
+ rocblas_operation cuTransA = + transA ? rocblas_operation_transpose : rocblas_operation_none; + rocblas_operation cuTransB = + transB ? rocblas_operation_transpose : rocblas_operation_none; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); +} template <> template @@ -569,6 +783,13 @@ void Blas::AXPY(int n, T alpha, const T *x, CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); }); } +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} template <> template @@ -576,6 +797,12 @@ void Blas::SCAL(int n, const T alpha, T *x) const { context_.CublasCall( [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} template <> template @@ -583,6 +810,12 @@ void Blas::VCOPY(int n, const T *x, T *y) const { context_.CublasCall( [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} template <> template @@ -596,6 +829,17 @@ void Blas::GEMV(bool trans_a, int M, int N, CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); }); } +template <> +template +void Blas::GEMV(bool trans_a, int M, int N, T alpha, + const T *A, const T *B, T beta, T *C) const { + rocblas_operation cuTransA = + !trans_a ? rocblas_operation_transpose : rocblas_operation_none; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} template <> template <> @@ -612,6 +856,23 @@ inline void Blas::GEMV( alpha, A, B, beta, C); } } +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} template <> template @@ -637,6 +898,32 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T *A, const T *B, + T beta, T *C, int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); +} template <> template @@ -648,6 +935,18 @@ void Blas::BatchedGEMM( C[k]); } } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T **A, + const T **B, T beta, T **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} template <> template <> @@ -661,6 +960,18 @@ inline void Blas::BatchedGEMM( B[k], beta, C[k]); } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} template <> template @@ -687,6 +998,30 @@ void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, lda, B, ldb); }); } +template <> +template +void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, CBLAS_DIAG diag, + int M, int N, T alpha, const T *A, int lda, + T *B, int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + rocblas_side cuSide = + (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; + rocblas_fill cuUplo = + (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; + // use CUBLAS_OP_C (conjugate transpose) for complex + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_diagonal cuDiag = + (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, + lda, B, ldb); + }); +} template <> template @@ -697,6 +1032,14 @@ void Blas::BatchedGETRF(int n, T **a, int *ipiv, CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); }); } +template <> +template +void Blas::BatchedGETRF(int n, T **a, int *ipiv, int *info, + int batch_size) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} template <> template @@ -715,6 +1058,22 @@ void Blas::BatchedGETRI(int n, const T **a, CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); }); } +template <> +template +void Blas::BatchedGETRI(int n, const T **a, const int *ipiv, + T **a_inv, int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, a, + platform::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. 
The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, a)); + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} template <> template @@ -725,6 +1084,14 @@ void Blas::BatchedMatInv(int n, const T **a, CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); }); } +template <> +template +void Blas::BatchedMatInv(int n, const T **a, T **a_inv, + int *info, int batch_size) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} template <> template @@ -739,6 +1106,20 @@ void Blas::BatchedGETRS( batch_size); }); } +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, int n, + int nrhs, const T **a, int lda, + int *ipiv, T **b, int ldb, int *info, + int batch_size) const { + rocblas_operation cuTrans = (trans == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} template <> template @@ -764,6 +1145,31 @@ void Blas::BatchedTRSM( &alpha, A, lda, B, ldb, batch_size); }); } +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, int M, int N, T alpha, + const T **A, int lda, T **B, int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + rocblas_side cuSide = + (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; + rocblas_fill cuUplo = + (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; + // use CUBLAS_OP_C (conjugate transpose) for complex + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_diagonal cuDiag = + (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, + &alpha, A, lda, B, ldb, batch_size); + }); +} } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index c8e2acea451a4..65e48e5817574 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" /** * case 1: @@ -441,6 +443,31 @@ void TestConcatMain() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void TestConcatMain() { + auto* context = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); + + ConcatCase1( + context); + ConcatCase2( + context); + ConcatCase3( + context); + ConcatCase4( + context); + + delete context; +} +#endif + TEST(math, concat) { TestConcatMain(); diff --git a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc index d45b57420eef1..a84b734fbeb2d 100644 --- a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc +++ b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc @@ -24,6 +24,11 @@ void TestNNZ(const std::vector& dense_data, const int correct_nnz, const int rows, const int cols) { paddle::platform::CUDADeviceContext* context = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); auto sparse = paddle::operators::math::GetSparse(*context); @@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector& correct_dense_data, const std::string& mode) { paddle::platform::CUDADeviceContext* context = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); // get sparse auto sparse = paddle::operators::math::GetSparse #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" template void testIm2col() { @@ -60,6 +62,7 @@ void testIm2col() { auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -164,6 +167,165 @@ void testIm2col() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding( + {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; + 
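For reference, with this 2 x 3 input, a 2 x 2 filter, stride 1 and zero padding the formulas give output_height = (2 - 2 + 0 + 0) / 1 + 1 = 1 and output_width = (3 - 2 + 0 + 0) / 1 + 1 = 2. A minimal CPU reference for the kCFO layout [channels, filter_h, filter_w, output_h, output_w] reproduces the expected out_cfo_data used later in this test; it is illustrative only (no padding or dilation), and the function name is not from the test.

// CPU reference sketch of kCFO im2col for a single image.
#include <vector>

std::vector<float> Im2ColCFO(const std::vector<float>& in, int C, int H, int W,
                             int KH, int KW, int stride) {
  const int OH = (H - KH) / stride + 1;
  const int OW = (W - KW) / stride + 1;
  std::vector<float> col(C * KH * KW * OH * OW);
  int idx = 0;
  for (int c = 0; c < C; ++c)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw)
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow)
            // Copy the input pixel covered by filter offset (kh, kw) at
            // output position (oh, ow).
            col[idx++] = in[(c * H + oh * stride + kh) * W + ow * stride + kw];
  return col;
}

// Usage: Im2ColCFO({0, 1, 2, 3, 4, 5}, 1, 2, 3, 2, 2, 1)
//        returns {0, 1, 1, 2, 3, 4, 4, 5}, matching out_cfo_data below.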
float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + // Im2Col + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CUDADeviceContext, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, + paddle::platform::CUDADeviceContext, float> + im2col_ocf; + + im2col(*context, input, dilation, stride, padding, &output_cfo); + im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(), + &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(), + &output_tmp); + out_ocf_ptr = output_tmp.data(); + } + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CUDADeviceContext, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, + paddle::platform::CUDADeviceContext, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + col2im(*context, output_cfo, dilation, stride, padding, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + delete place; + delete 
context; +} +#endif + TEST(math, im2col) { testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 54a37db1df71a..5fdc2889a8885 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, constexpr size_t kThreadNumY = 32; size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY; - grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x); + grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]); dim3 thread_dims(kThreadNumX, kThreadNumY); if (reverse) { InclusiveScanInnerDimCUDAKernel< diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 9ade45ee743a3..f0ef692b99f57 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/kernels/funcs/eigen/common.h" namespace paddle { @@ -44,6 +45,18 @@ template struct SetConstant>; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + template struct SetConstant; template struct SetConstant(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); + ctx->Init(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, t.data()[i], diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 44b1ee45a4fe9..39c91e96a70fa 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, 
context.stream()) + .get()); + context.PartialInitWithAllocator(); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); int m = 2; int n = 3; @@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); int m = 2; int n = 3; @@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); T* data_a = mat_a.mutable_data({m, n}, cpu_place); T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 8cd2824465879..210cf10d8879d 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" template void testVol2col() { @@ -25,7 +27,6 @@ void testVol2col() { auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); - /** * input = [[0, 1, 2, * 3, 4, 5] @@ -123,6 +124,124 @@ void testVol2col() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void testVol2col() { + paddle::framework::Tensor input; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; + paddle::framework::Tensor output_tmp; + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector strides({1, 1, 1}); + std::vector paddings({0, 0, 0}); + std::vector dilations({1, 1, 1}); + int output_depth = + (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1; + int output_height = + (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1; + int output_width = + (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor + vol2col; + vol2col(*context, input, dilations, strides, paddings, &output); + + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output.data(); + } else { + paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(), + &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + + for (int i = 0; i < 16; ++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } + + // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + paddle::operators::math::Col2VolFunctor + col2vol; + col2vol(*context, output, dilations, strides, paddings, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_ptr[i], col_2_vol[i]); + } + + delete place; + delete context; +} +#endif + TEST(math, vol2col) { testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git 
a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 63b5b871aabb5..db8f586297c54 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel { reduce_dims.push_back(i); } TensorReduceFunctorImpl( - *input, output, Div(numel), reduce_dims, stream); + context.cuda_device_context(), *input, output, Div(numel), reduce_dims, + stream); } }; diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 41c1b4d7a8f81..9fcbfa90230c5 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); + auto *ctx = new p::CUDADeviceContext(place); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctxs_.emplace_back(ctx); } NCCLInitOp(); diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index c0bd906685d4d..e11fe478106f9 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel { using MT = typename details::MPTypeTrait::Type; if (porder == 0) { TensorReduceFunctorImpl>( - *in_x, out_norm, NonzeroFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor(), + reduce_axis, stream); } else if (porder == INFINITY) { TensorReduceFunctorImpl>( - *in_x, out_norm, AbsFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor(), + reduce_axis, stream); } else if (porder == -INFINITY) { TensorReduceFunctorImpl>( - *in_x, out_norm, AbsFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor(), + reduce_axis, stream); } else { TensorReduceFunctorImpl>( - *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, + UnsignedPowFunctor(porder), reduce_axis, stream); const framework::Tensor* tmp_norm = out_norm; std::vector ins = {tmp_norm}; diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 84c1988e29b07..f7f60e82216ae 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); TensorReduceFunctorImpl>( - *in_x, out, kps::DivideFunctor(reduce_num), reduce_dim, - stream); + dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), + reduce_dim, stream); #else // for cpu paddle::operators::math::Pool2dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index c6997603bb13b..9493b6d4391d5 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel { } TensorReduceFunctorImpl>( - dalpha_tmp, dalpha, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), dalpha_tmp, dalpha, + kps::IdentityFunctor(), reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc 
b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 21f21cdc95606..2c701bdae7601 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); framework::Executor exe(place); // create var on local scope diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 730a84da88b59..f741c5941eb26 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -39,14 +39,16 @@ namespace operators { template class ReduceOp, typename TransformOp> -void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, +void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, gpuStream_t stream) { y->mutable_data(x.place()); pten::kernels::TensorReduceFunctorImpl( - x, y, transform, origin_reduce_dims, stream); + static_cast(dev_ctx), x, y, transform, + origin_reduce_dims, stream); } } // namespace operators diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index 1d76eaf27e818..ca6169d0410f1 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel { cuda_ctx, ins, &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( - pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, + stream); RenormKernelFunc3<<>>( numel, dim_value.mutable_data(context.GetPlace()), p, max_norm); RenormKernelFunc4<<>>( @@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel { dim_divisor); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( - pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), pow_value, &dim_value, + kps::IdentityFunctor(), reduce_axis, stream); TensorReduceFunctorImpl>( - mul_value, &weight_derivative, kps::IdentityFunctor(), reduce_axis, - stream); + ctx.cuda_device_context(), mul_value, &weight_derivative, + kps::IdentityFunctor(), reduce_axis, stream); RenormGradKernelFunc2<<>>( x_data, dout_data, dx_data, numel, dim_value.mutable_data(ctx.GetPlace()), diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d54d5234010ba..99e40de3080e3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -389,7 +389,8 @@ class ReshapeKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #endif #ifdef PADDLE_WITH_XPU @@ -417,7 +418,8 @@ class ReshapeGradKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *d_out, 
d_x); + pten::ReshapeGradKernel(static_cast(dev_ctx), + *d_out, d_x); } #endif #ifdef PADDLE_WITH_XPU @@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *dd_x, dd_out); } #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 98311ff404b47..13c08aea68849 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index, int64_t max_grid_dimx = reinterpret_cast(ctx) - .GetCUDAMaxGridDimSize() - .x; + .GetCUDAMaxGridDimSize()[0]; int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; ScatterInitCUDAKernel<<< diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 0acef78484cd3..ea3a5aa5af9b2 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output, #if defined(__NVCC__) || defined(__HIPCC__) auto stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - *input, output, kps::IdentityFunctor(), reduce_dims, stream); + ctx.cuda_device_context(), *input, output, kps::IdentityFunctor(), + reduce_dims, stream); #else ReduceKernelFunctor( input, output, reduce_dims, keep_dim, false, ctx) diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index 1ab036e869294..514bdac9c55f1 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/strided_memcpy.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" namespace paddle { namespace operators { @@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) { platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto src_allocation = memory::Alloc(gpu0, sizeof(src)); @@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); int* gpu_src = reinterpret_cast(gpu_src_allocation->ptr()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index d2d04a4fa50b4..90d489c8df733 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name, tensor->Resize(dims); platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); // Prepare variables. if (allow_build_at_runtime) CreateCUDATensor(&scope, "x", std::vector({3, 4, 1, 1})); @@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 05ae5c9188ceb..c077411e49681 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, }; int block_size = ComputeBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size unsigned int grid_size = num_rows < maxGridDimX ? 
static_cast(num_rows) diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index 98a77637f9204..0b9e615eece8f 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel { std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); TensorReduceFunctorImpl>( - diag, out, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), diag, out, kps::IdentityFunctor(), + reduce_dims, stream); } else { math::SetConstant functor; functor(context.device_context(), out, static_cast(0)); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu index b7ea5cd953186..28cdc56e2aec1 100644 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ b/paddle/fluid/operators/triangular_solve_op.cu @@ -45,7 +45,8 @@ class MatrixReduceSumFunctor { } gpuStream_t stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), + out_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 7ad3335009b06..b52f46e4a8ac6 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -148,7 +148,7 @@ struct Argmax { } const auto& dev_ctx = ctx.cuda_device_context(); auto cu_stream = dev_ctx.stream(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t height = pre * post; int64_t width = n; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e35b586dc90b0..07d3e41958293 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -138,6 +138,7 @@ if(WITH_CNCL) endif() if(WITH_GPU OR WITH_ROCM) + target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index dbbb72920a53b..a82043cd7c435 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); CUDADeviceContext gpu_ctx(gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); // GPU LoDTensor to CPU LoDTensor diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index dd2dc9a40799e..ae1df10c45f77 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" @@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new 
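// ---- Editor's note (illustrative sketch, not part of the patch) ----
// A recurring change in the test hunks above (strided_memcpy_test.cc,
// tensorrt_engine_op_test.cc, bfloat16_test.cu): a stand-alone
// platform::CUDADeviceContext is no longer usable right after construction.
// Tests must bind a device allocator tied to the context's stream and then
// call PartialInitWithAllocator(). Minimal sketch of that sequence:
platform::CUDAPlace gpu0(0);
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                     .GetAllocator(gpu0, ctx.stream())
                     .get());
ctx.PartialInitWithAllocator();  // finishes init of the pten::GPUContext base
// ---- end editor's note ----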
CUDADeviceContext(CUDAPlace(dev_id))); + dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream()) + .get()); + dev_ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(dev_id)) + .get()); + dev_ctx->PartialInitWithAllocator(); std::shared_ptr compute_event( platform::CudaEventResourcePool::Instance().New(dev_id)); @@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks, auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::XPUPlace(dev_id))); - dev_ctx->set_bkcl_context(comm); + dev_ctx->SetBkclContext(comm); } return comm_map_[ring_id][dev_id].get(); diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 5cf2258204fda..00f0cc2ac92bf 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,12 +1,12 @@ IF(WITH_GPU) add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) ELSEIF(WITH_ROCM) add_subdirectory(rocm) - hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) + hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 5df1de1b00fac..8f7fd3dcbc03a 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ -nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda) nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 3199af9c97520..ab7d474c1ac38 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -14,8 +14,10 @@ #pragma once +#include #include // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -96,8 +98,7 @@ class CublasHandleHolder { PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); } - template - inline void Call(Callback&& callback) const { + inline void Call(const std::function& callback) const { std::lock_guard guard(mtx_); callback(handle_); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h index 43da9bb1fb42d..cc2b7349d5017 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h @@ -14,11 +14,13 @@ limitations under the License. 
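// ---- Editor's note (illustrative sketch, not part of the patch) ----
// When a CUDADeviceContext is created outside DeviceContextPool (as in
// AssignNCCLComm above and NCCLContext below), the patch wires all three
// allocators before PartialInitWithAllocator(). Condensed sketch of the hunk:
auto dev_ctx = std::make_unique<platform::CUDADeviceContext>(
    platform::CUDAPlace(dev_id));
auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
// device allocations, bound to this context's stream
dev_ctx->SetAllocator(
    facade.GetAllocator(platform::CUDAPlace(dev_id), dev_ctx->stream()).get());
// host-side (CPU) allocations
dev_ctx->SetHostAllocator(facade.GetAllocator(platform::CPUPlace()).get());
// zero allocator for this device
dev_ctx->SetZeroAllocator(
    facade.GetZeroAllocator(platform::CUDAPlace(dev_id)).get());
dev_ctx->PartialInitWithAllocator();
// ---- end editor's note ----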
*/ #pragma once +#include #include // NOLINT #include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" namespace paddle { namespace platform { @@ -45,8 +47,8 @@ class CusparseHandleHolder { #endif } - template - inline void Call(Callback&& callback) const { + inline void Call( + const std::function& callback) const { std::lock_guard guard(mtx_); callback(handle_); } diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6e6824ed7896f..59fb26e696e12 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include #include #include #include @@ -39,11 +40,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/pten/backends/gpu/gpu_info.h" + DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_string(selected_gpus); DECLARE_uint64(gpu_memory_limit_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; @@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; namespace paddle { namespace platform { -//! Get a list of device ids from environment variable or use all. -std::vector GetSelectedDevices() { - // use user specified GPUs in single-node multi-process mode. - std::vector devices; - if (!FLAGS_selected_gpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetGPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; -} void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; @@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); } +int DnnVersion() { return pten::backends::gpu::DnnVersion(); } + +int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); } + +int GetGPUComputeCapability(int id) { + return pten::backends::gpu::GetGPUComputeCapability(id); +} + +int GetGPURuntimeVersion(int id) { + return pten::backends::gpu::GetGPURuntimeVersion(id); +} + +int GetGPUDriverVersion(int id) { + return pten::backends::gpu::GetGPUDriverVersion(id); +} + +bool TensorCoreAvailable() { + return pten::backends::gpu::TensorCoreAvailable(); +} + +int GetGPUMultiProcessors(int id) { + return pten::backends::gpu::GetGPUMultiProcessors(id); +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id); +} + +int GetGPUMaxThreadsPerBlock(int id) { + return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id); +} + +int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); } + +std::array GetGpuMaxGridDimSize(int id) { + return pten::backends::gpu::GetGpuMaxGridDimSize(id); +} + +std::vector GetSelectedDevices() { + return pten::backends::gpu::GetSelectedDevices(); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + return 
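// ---- Editor's note (self-contained sketch, not part of the patch) ----
// The fluid-level body of GetSelectedDevices() (parse FLAGS_selected_gpus,
// otherwise enumerate every visible device) is deleted above; the function now
// forwards to pten::backends::gpu::GetSelectedDevices(). The helper below
// restates the removed logic in a self-contained form; it is an assumption
// that the pten-side implementation behaves the same way.
#include <sstream>
#include <string>
#include <vector>

std::vector<int> SelectDevices(const std::string& selected_gpus,
                               int device_count) {
  std::vector<int> devices;
  if (!selected_gpus.empty()) {
    // e.g. selected_gpus == "0,2,3"  ->  {0, 2, 3}
    std::stringstream ss(selected_gpus);
    for (std::string id; std::getline(ss, id, ',');) {
      devices.push_back(std::stoi(id));
    }
  } else {
    for (int i = 0; i < device_count; ++i) devices.push_back(i);
  }
  return devices;
}
// ---- end editor's note ----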
pten::backends::gpu::GetDeviceProperties(id); +} + +void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); } + +gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); } + +void GpuStreamSync(gpuStream_t stream) { + pten::backends::gpu::GpuStreamSync(stream); +} + +void GpuDestroyStream(gpuStream_t stream) { + pten::backends::gpu::GpuDestroyStream(stream); +} + +void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); } + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + pten::backends::gpu::GpuMemcpySync(dst, src, count, kind); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, + count, stream); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, + count); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 1e072625ada1a..f6fb2ad8ce72e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include +#include #include #include @@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id); int GetCurrentDeviceId(); //! Get the maximum GridDim size for GPU buddy allocator. -dim3 GetGpuMaxGridDimSize(int); +std::array GetGpuMaxGridDimSize(int); //! Get a list of device ids from environment variable or use all. std::vector GetSelectedDevices(); @@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream); void GpuDestroyStream(gpuStream_t stream); // ! Blocks until device has completed all operations. -void GpuDeviceync(); +void GpuDeviceSync(); //! 
CudaMalloc with recorded info gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 261916b2555be..1d6ccdc1280a9 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -83,8 +83,21 @@ struct NCCLContext { std::unique_ptr ctx_; ncclComm_t comm_; - explicit NCCLContext(int dev_id) - : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} + explicit NCCLContext(int dev_id) : comm_{nullptr} { + ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id))); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(dev_id), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(dev_id)) + .get()); + ctx_->PartialInitWithAllocator(); + } gpuStream_t stream() const { return ctx_->stream(); } ncclComm_t comm() const { return comm_; } diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt index 86b9ecd5f5445..988807258c123 100644 --- a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -1,3 +1 @@ -hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda) - hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index fdd9883c2c924..1e674258334b0 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include #include +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" @@ -149,16 +153,17 @@ inline void EmplaceDeviceContext( cuda_ctx, platform::errors::InvalidArgument( "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); - dev_ctx->SetDeviceAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p, cuda_ctx->context()->RawStream()) - .get()); + // Note: A trick method to init context, why GetAllocator interface + // needs a stream parameter? 
+ dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p, cuda_ctx->stream()) + .get()); + cuda_ctx->PartialInitWithAllocator(); #endif } else { - dev_ctx->SetDeviceAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p) - .get()); + dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p) + .get()); } dev_ctx->SetHostAllocator( memory::allocation::AllocatorFacade::Instance() @@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool( } } -CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} +CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() { + pten::CPUContext::Init(); +} -CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) { + pten::CPUContext::Init(); +} #ifdef PADDLE_WITH_IPU IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} -Place IPUDeviceContext::GetPlace() const { return place_; } +const Place& IPUDeviceContext::GetPlace() const { return place_; } void IPUDeviceContext::Wait() const { /*! \brief Wait for all operations completion in the stream. */ @@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} +XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() { + pten::XPUContext::Init(); +} XPUDeviceContext::~XPUDeviceContext() {} XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { + pten::XPUContext::Init(); LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " << static_cast(place.device); } @@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const { aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } -Place NPUDeviceContext::GetPlace() const { return place_; } +const Place& NPUDeviceContext::GetPlace() const { return place_; } aclrtContext NPUDeviceContext::context() const { return context_; } @@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place NPUPinnedDeviceContext::GetPlace() const { return place_; } +const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; } #endif @@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() { #endif } -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { - CUDADeviceGuard guard(place_.device); - compute_capability_ = GetGPUComputeCapability(place_.device); - multi_process_ = GetGPUMultiProcessors(place_.device); - max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); - max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); - max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device); - - driver_version_ = GetGPUDriverVersion(place_.device); - runtime_version_ = GetGPURuntimeVersion(place_.device); - - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " - << static_cast(place_.device) - << ", GPU Compute Capability: " - << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 1000 - << "." << (driver_version_ % 100) / 10 - << ", Runtime API Version: " - << runtime_version_ / 1000 << "." 
- << (runtime_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", MIOpen Version: " << version_major << "." - << version_minor << "." << version_patch; -#else - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; -#endif - { - // Check CUDA/CUDNN version compatiblity - auto local_cuda_version = - (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10); -#else - auto compile_cuda_version = - (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; -#endif - if (local_cuda_version < compile_cuda_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDA " - "version."; - } - } - default_ctx_.reset(new CUDAContext(place_)); -} - -CUDADeviceContext::~CUDADeviceContext() { - SetDeviceId(place_.device); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nccl_comm_) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); - } -#endif -} - -Place CUDADeviceContext::GetPlace() const { return place_; } - -void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); } - -int CUDADeviceContext::GetComputeCapability() const { - return compute_capability_; -} - -int CUDADeviceContext::GetMaxPhysicalThreadCount() const { - return multi_process_ * max_threads_per_mp_; +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : pten::GPUContext(place) { + pten::GPUContext::PartialInitWithoutAllocator(); + cuda_stream_.reset( + new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace())); } -int CUDADeviceContext::GetSMCount() const { return multi_process_; } - -int CUDADeviceContext::GetMaxThreadsPerBlock() const { - return max_threads_per_block_; -} +CUDADeviceContext::~CUDADeviceContext() = default; Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { - return context()->EigenDevice().get(); -} - -bool CUDADeviceContext::tensor_core_available() const { - return context()->CublasTensorCoreHandle() != nullptr; + if (thread_ctx_.count(this)) { + return context()->EigenDevice().get(); + } + return pten::GPUContext::eigen_device(); } -dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { - return max_grid_dim_size_; +void CUDADeviceContext::Wait() const { + if (thread_ctx_.count(this)) { + context()->Stream()->Wait(); + return; + } + pten::GPUContext::Wait(); } #ifdef PADDLE_WITH_HIP @@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const { #else cudnnHandle_t CUDADeviceContext::cudnn_handle() const { #endif - return context()->CudnnHandle(); + if (thread_ctx_.count(this)) { + return context()->CudnnHandle(); + } + return pten::GPUContext::cudnn_handle(); } #ifdef PADDLE_WITH_HIP rocblas_handle CUDADeviceContext::cublas_handle() const { - return 
context()->CublasHandle()->GetCublasHandle(); + if (thread_ctx_.count(this)) { + return context()->CublasHandle()->GetCublasHandle(); + } + return pten::GPUContext::cublas_handle(); } #else cublasHandle_t CUDADeviceContext::cublas_handle() const { - return context()->CublasHandle()->GetCublasHandle(); + if (thread_ctx_.count(this)) { + return context()->CublasHandle()->GetCublasHandle(); + } + return pten::GPUContext::cublas_handle(); } cusparseHandle_t CUDADeviceContext::cusparse_handle() const { - return context()->CusparseHandle()->GetCusparseHandle(); + if (thread_ctx_.count(this)) { + return context()->CusparseHandle()->GetCusparseHandle(); + } + return pten::GPUContext::cusparse_handle(); +} +cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { + if (thread_ctx_.count(this)) { + return context()->CusolverDnHandle(); + } + return pten::GPUContext::cusolver_dn_handle(); } #endif +void CUDADeviceContext::RecordEvent( + gpuEvent_t ev, const std::function& callback) const { + if (thread_ctx_.count(this)) { + context()->Stream()->RecordEvent(ev, callback); + return; + } + pten::GPUContext::RecordEvent(ev, callback); +} + +void CUDADeviceContext::AddStreamCallback( + const std::function& callback) const { + if (thread_ctx_.count(this)) { + context()->Stream()->AddCallback(callback); + return; + } + pten::GPUContext::AddStreamCallback(callback); +} + +void CUDADeviceContext::WaitStreamCallback() const { + if (thread_ctx_.count(this)) { + context()->Stream()->WaitCallback(); + return; + } + pten::GPUContext::WaitStreamCallback(); +} + CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); } -#ifndef PADDLE_WITH_HIP -cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { - return context()->CusolverDnHandle(); +gpuStream_t CUDADeviceContext::stream() const { + if (thread_ctx_.count(this)) { + return context()->RawStream(); + } + return pten::GPUContext::stream(); } -#endif -gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); } +std::shared_ptr CUDADeviceContext::context() const { + if (!thread_ctx_.count(this)) { + PADDLE_THROW(platform::errors::PermissionDenied( + "CUDADeviceContext call context() failed, make sure in the " + "thread_local semantic.")); + } + return thread_ctx_.at(this); +} + +stream::CUDAStream* CUDADeviceContext::GetCudaStream() const { + return cuda_stream_.get(); +} + +stream::CUDAStream* CUDADeviceContext::SetCudaStream( + stream::CUDAStream* new_stream_ptr) { + auto* old_stream_ptr = cuda_stream_.release(); + cuda_stream_.reset(new_stream_ptr); + return old_stream_ptr; +} CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); @@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } +const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; } #endif #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 17b22907b1532..4d469e92c04cd 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include // NOLINT #include #include // NOLINT @@ -18,7 +19,9 @@ limitations under the License. 
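// ---- Editor's note (self-contained mock, not Paddle code) ----
// Every accessor rewritten above follows the same dispatch rule: if a
// thread-local CUDAContext has been registered for this object (thread_ctx_),
// use its handle/stream; otherwise fall back to the resource owned by the
// pten::GPUContext base class. A minimal mock of that pattern:
#include <memory>
#include <unordered_map>

struct BaseContext {                    // stands in for pten::GPUContext
  int handle() const { return 1; }
};
struct ThreadLocalContext {             // stands in for CUDAContext
  int handle() const { return 2; }
};
struct DerivedContext : BaseContext {   // stands in for CUDADeviceContext
  int handle() const {
    if (thread_ctx_.count(this)) {
      return thread_ctx_.at(this)->handle();  // thread-local override wins
    }
    return BaseContext::handle();             // default: base-class resource
  }
  static thread_local std::unordered_map<const DerivedContext*,
                                         std::shared_ptr<ThreadLocalContext>>
      thread_ctx_;
};
thread_local std::unordered_map<const DerivedContext*,
                                std::shared_ptr<ThreadLocalContext>>
    DerivedContext::thread_ctx_;
// ---- end editor's note ----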
*/ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" #include "paddle/pten/core/device_context.h" #include "paddle/fluid/memory/malloc.h" @@ -28,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/dynload/cusparse.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -38,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/dynload/rccl.h" #endif @@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext { explicit IPUDeviceContext(IPUPlace place); virtual ~IPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - Place GetPlace() const override; + const Place& GetPlace() const override; /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; @@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext { explicit NPUDeviceContext(NPUPlace place); virtual ~NPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - Place GetPlace() const override; + const Place& GetPlace() const override; aclrtContext context() const; /*! \brief Wait for all operations completion in the stream. */ @@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext { NPUPinnedDeviceContext(); explicit NPUPinnedDeviceContext(NPUPinnedPlace place); - Place GetPlace() const override; + const Place& GetPlace() const override; Eigen::DefaultDevice* eigen_device() const; @@ -326,20 +331,20 @@ class CUDAContext { #endif /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { + inline void CublasCall( + const std::function& callback) const { if (cublas_tf32_tensor_core_handle_) { - cublas_tf32_tensor_core_handle_->Call(std::forward(callback)); + cublas_tf32_tensor_core_handle_->Call(callback); } else { - cublas_handle_->Call(std::forward(callback)); + cublas_handle_->Call(callback); } } #ifndef PADDLE_WITH_HIP /*! \brief Call cusparse function safely. */ - template - inline void CusparseCall(Callback&& callback) const { - cusparse_handle_->Call(std::forward(callback)); + inline void CusparseCall( + const std::function& callback) const { + cusparse_handle_->Call(callback); } #endif @@ -348,12 +353,12 @@ class CUDAContext { /*! \brief Call cublas function with Tensor Core safely. If Tensor Core is not available, use DEFAULT_MATH instead. 
*/ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { if (cublas_tensor_core_handle_) { - cublas_tensor_core_handle_->Call(std::forward(callback)); + cublas_tensor_core_handle_->Call(callback); } else { - cublas_handle_->Call(std::forward(callback)); + cublas_handle_->Call(callback); } } @@ -491,7 +496,7 @@ class CUDAContext { DISABLE_COPY_AND_ASSIGN(CUDAContext); }; -class CUDADeviceContext : public DeviceContext { +class CUDADeviceContext : public pten::GPUContext { public: explicit CUDADeviceContext(CUDAPlace place); virtual ~CUDADeviceContext(); @@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - /*! \brief Return place in the device context. */ - Place GetPlace() const override; - - /*! \brief Return compute capability in the device context. */ - int GetComputeCapability() const; - - /*! \brief Return the max physical thread count in the device context */ - int GetMaxPhysicalThreadCount() const; - - /*! \brief Return the SM count in the device context */ - int GetSMCount() const; - - /*! \brief Return the Max thread num of block in the device context */ - int GetMaxThreadsPerBlock() const; - - /*! \brief Return the max grid dim size in the device context */ - dim3 GetCUDAMaxGridDimSize() const; - /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { + inline void CublasCall( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::CublasCall(callback); + return; + } return context()->CublasCall(callback); } #ifndef PADDLE_WITH_HIP /*! \brief Call cusparse function safely. */ - template - inline void CusparseCall(Callback&& callback) const { - return context()->CusparseCall(callback); + inline void CusparseCall( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::CusparseCall(callback); + return; + } + context()->CusparseCall(callback); } #endif - /*! \brief Check whether tensor core is supported */ - bool tensor_core_available() const; - /*! \brief Call cublas function with Tensor Core safely. If Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - return context()->TensorCoreCublasCallIfAvailable(callback); + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::TensorCoreCublasCallIfAvailable(callback); + return; + } + context()->TensorCoreCublasCallIfAvailable(callback); } /*! \brief Return cudnn handle in the device context. */ @@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext { cusparseHandle_t cusparse_handle() const; #endif +#ifndef PADDLE_WITH_HIP + cusolverDnHandle_t cusolver_dn_handle() const; +#endif + /*! \brief Return a cudnn workspace handle to call multiple cudnn * functions without interrupting by other threads. * Once the first cudnn function is called by the handle, a lock @@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext { * sequential cudnn function calls. 
*/ CudnnWorkspaceHandle cudnn_workspace_handle() const; -#ifndef PADDLE_WITH_HIP - cusolverDnHandle_t cusolver_dn_handle() const; -#endif - /*! \brief Return cuda stream in the device context. */ gpuStream_t stream() const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - /*! \brief Return nccl communicators. */ - ncclComm_t nccl_comm() const { return nccl_comm_; } - - /*! \brief Set nccl communicators. */ - void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; } -#endif - - template - void RecordEvent(gpuEvent_t ev, Callback callback) const { - return context()->Stream()->RecordEvent(ev, callback); - } - - template - void AddStreamCallback(Callback&& callback) const { - return context()->Stream()->AddCallback(callback); - } + void RecordEvent(gpuEvent_t ev, const std::function& callback) const; - void WaitStreamCallback() const { - return context()->Stream()->WaitCallback(); - } + void AddStreamCallback(const std::function& callback) const; - void ResetDefaultContext(const stream::Priority& priority) { - default_ctx_.reset(new CUDAContext(place_, priority)); - } + void WaitStreamCallback() const; void ResetThreadContext(const stream::Priority& priority) { std::lock_guard guard(ctx_mtx_); - thread_ctx_[this].reset(new CUDAContext(place_, priority)); + thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority)); } - std::shared_ptr context() const { - if (!thread_ctx_.count(this)) { - return default_ctx_; - } - return thread_ctx_.at(this); - } + std::shared_ptr context() const; // Note: Can only be used under thread_local semantics. void SetThreadLocalStream(const gpuStream_t stream) { thread_ctx_.at(this)->SetStream(stream); } - private: - CUDAPlace place_; - std::shared_ptr default_ctx_; + // NOTE: Just for compatibility with the past, please delete if there is an + // elegant way. + stream::CUDAStream* GetCudaStream() const; + stream::CUDAStream* SetCudaStream(stream::CUDAStream*); + private: // The thread_local static variable will be released before the // global static variable, so avoid using it in dtor. static thread_local std::unordered_map cuda_stream_; DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; @@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext { CUDAPinnedDeviceContext(); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); - Place GetPlace() const override; + const Place& GetPlace() const override; Eigen::DefaultDevice* eigen_device() const; diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index cf617a478eb71..851c756b665b8 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -17,6 +17,7 @@ limitations under the License. 
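// ---- Editor's note (illustrative sketch, not part of the patch) ----
// With the template callbacks replaced by std::function above, call sites keep
// the same lambda form. The callback parameter type is assumed to be
// cublasHandle_t (rocblas_handle under HIP); the exact typedef is stripped in
// this rendering of the diff.
dev_ctx.CublasCall([&](cublasHandle_t handle) {
  // issue cuBLAS calls on `handle`; the holder serializes access with a mutex
});
// TensorCoreCublasCallIfAvailable(...) takes the same callback and silently
// falls back to the default-math handle when Tensor Cores are unavailable.
// ---- end editor's note ----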
*/ #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -26,6 +27,20 @@ TEST(Device, Init) { int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(i), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(i)) + .get()); + device_context->PartialInitWithAllocator(); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; @@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) { int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(i), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(i)) + .get()); + device_context->PartialInitWithAllocator(); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 0a6b3917fbc21..a811a5b9c130d 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) { platform::errors::PreconditionNotMet( "Failed to dynamic_cast context into CUDADeviceContext.")); - wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream()); + wrapper->inner_event_.Record(cuda_dev_ctx->stream()); } bool DeviceEventQueryCUDA(const DeviceEvent* event) { @@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event, platform::errors::PreconditionNotMet( "Failed to dynamic_cast context into CUDADeviceContext.")); // calling cudaStreamWaitEvent(stream, event, 0) - cuda_dev_ctx->context()->Stream()->WaitEvent( - wrapper->inner_event_.GetRawCudaEvent()); + cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent()); } void DeviceEventCPUWaitCUDA(const DeviceEvent* event, diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index a56d94b892e98..96e89f9257dd2 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/device_event.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/place.h" using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; @@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) { // case 1. test for event_creator DeviceEvent event(place); ASSERT_NE(event.GetEvent().get(), nullptr); + bool status = event.Query(); + ASSERT_EQ(status, true); // case 2. 
test for event_recorder event.Record(context); - bool status = event.Query(); + status = event.Query(); ASSERT_EQ(status, false); // case 3. test for event_finisher event.Finish(); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c751ee1e69b2b..7de5055431972 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - retry_sleep(FLAGS_gpu_allocator_retry_time); \ + paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ __cond__ = (COND); \ ++retry_count; \ } \ @@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - retry_sleep(FLAGS_gpu_allocator_retry_time); \ + ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ __cond__ = (COND); \ ++retry_count; \ } \ diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index da5080cc86f0c..86df34acd7612 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -152,11 +152,11 @@ class CudaEvent { #endif } - void Record(const paddle::platform::stream::CUDAStream &stream) { + void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); #else - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif } diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 7a047d790abdc..cc5c11778aa30 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); CUDADeviceContext gpu_ctx(gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); // GPU LoDTensor to CPU LoDTensor diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 1d4be3801dda9..aef36a893156d 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" namespace paddle { namespace platform { @@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) { } } +// NOTE: After the pten kernel is migrated, it needs to be deleted. 
template <> struct ForRange { ForRange(const CUDADeviceContext& dev_ctx, size_t limit) @@ -106,6 +108,40 @@ struct ForRange { size_t limit_; }; +template <> +struct ForRange { + ForRange(const pten::GPUContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { +#ifdef __HIPCC__ + // HIP will throw core dump when threads > 256 + constexpr int num_threads = 256; +#elif WITH_NV_JETSON + // JETSON_NANO will throw core dump when threads > 128 + int num_thread = 256; + platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); + const int num_threads = num_thread; +#else + constexpr int num_threads = 1024; +#endif + size_t block_size = limit_ <= num_threads ? limit_ : num_threads; + size_t grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const pten::GPUContext& dev_ctx_; + size_t limit_; +}; + #endif } // namespace platform diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 5697bbee0bb92..e3e735d03aba1 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) { platform::Place device = CUDAPlace(deviceId); - auto stream = static_cast(pool.Get(device)) - ->context() - ->Stream() - .get(); - return stream; + return static_cast(pool.Get(device)) + ->GetCudaStream(); #else PADDLE_THROW(platform::errors::Unavailable( "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); @@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) { auto& device = stream->GetPlace(); auto& pool = platform::DeviceContextPool::Instance(); return static_cast(pool.Get(device)) - ->context() - ->SetStream(stream); + ->SetCudaStream(stream); #else PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); - return nullptr; + "Paddle is not compiled with CUDA. Cannot visit cuda current" + "stream.")); + return CUDAStream(nullptr); #endif } } // namespace stream diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 0683cf4b0424e..2b54f0861f426 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -15,6 +15,7 @@ limitations under the License. 
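// ---- Editor's note (illustrative sketch, not part of the patch) ----
// Usage of the new ForRange<pten::GPUContext> specialization added above. The
// element type T and the SquareFunctor name are illustrative assumptions; the
// functor form mirrors existing ForRange call sites.
template <typename T>
struct SquareFunctor {
  const T* in;
  T* out;
  HOSTDEVICE void operator()(size_t i) const { out[i] = in[i] * in[i]; }
};
// Inside a kernel that holds a const pten::GPUContext& dev_ctx:
//   platform::ForRange<pten::GPUContext> for_range(dev_ctx, n);
//   for_range(SquareFunctor<T>{in_ptr, out_ptr});
// ---- end editor's note ----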
*/ #pragma once #include +#include #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -51,24 +52,28 @@ class CUDAStream final { const StreamFlag& flag = StreamFlag::kDefaultFlag) { Init(place, priority, flag); } + explicit CUDAStream(gpuStream_t stream, const Place& place) + : place_(place), stream_(stream) { + owned_stream_ = false; + callback_manager_.reset(new StreamCallbackManager(stream_)); + } virtual ~CUDAStream() { Destroy(); } bool Init(const Place& place, const Priority& priority = Priority::kNormal, const StreamFlag& flag = StreamFlag::kDefaultFlag); - template - void AddCallback(Callback&& callback) const { + void AddCallback(std::function callback) const { callback_manager_->AddCallback(callback); } - template #ifdef PADDLE_WITH_HIP - void RecordEvent(hipEvent_t ev, Callback callback) const { + void RecordEvent(hipEvent_t ev, const std::function& callback) const { callback(); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else - void RecordEvent(cudaEvent_t ev, Callback callback) const { + void RecordEvent(cudaEvent_t ev, + const std::function& callback) const { callback(); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } @@ -149,6 +154,7 @@ class CUDAStream final { }; CUDAStream* get_current_stream(int deviceId); +// NOTE: There is a problem with the interface and needs to be fixed CUDAStream* set_current_stream(CUDAStream* stream); } // namespace stream diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 32ec113d1f5e5..e26fd511aa9af 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/transform.h" @@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); float* gpu_buf = static_cast(gpu_allocation->ptr()); @@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) { int buf[4] = {1, 2, 3, 4}; CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto gpu_allocation = Alloc(gpu0, sizeof(buf)); int* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 21571e17a2b48..64c145c94f99d 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) { return paddle::platform::stream::get_current_stream(deviceId); #else PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current " + "Paddle is not compiled with CUDA. 
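// ---- Editor's note (illustrative sketch, not part of the patch) ----
// The new CUDAStream(gpuStream_t, const Place&) constructor wraps a stream the
// object does not own (owned_stream_ = false), so destroying the wrapper never
// destroys the underlying stream. The patch itself uses it inside the
// CUDADeviceContext constructor to expose the pten::GPUContext stream through
// GetCudaStream()/SetCudaStream(). Stand-alone sketch (the stream-creation
// call below is only for illustration):
gpuStream_t raw_stream;
#ifdef PADDLE_WITH_HIP
hipStreamCreate(&raw_stream);
#else
cudaStreamCreate(&raw_stream);
#endif
platform::stream::CUDAStream wrapper(raw_stream, platform::CUDAPlace(0));
// wrapper.raw_stream() == raw_stream; callbacks registered through
// wrapper.AddCallback(...) still run on raw_stream.
// ---- end editor's note ----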
Cannot visit cuda current" "stream.")); #endif }, @@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) { [](paddle::platform::stream::CUDAStream &self, paddle::platform::stream::CUDAStream &stream) { paddle::platform::CudaEvent event; - event.Record(stream); + event.Record(stream.raw_stream()); self.WaitEvent(event.GetRawCudaEvent()); }, @@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) { if (event == nullptr) { event = new paddle::platform::CudaEvent(); } - event->Record(self); + event->Record(self.raw_stream()); return event; }, R"DOC( @@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) { if (stream == nullptr) { stream = paddle::platform::stream::get_current_stream(-1); } - self.Record(*stream); + self.Record(stream->raw_stream()); }, R"DOC( Records the event in the given stream. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e8dbe41ebba3a..9a535f5fb0492 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle. .def_static("create", [](paddle::platform::CPUPlace& place) -> paddle::platform::DeviceContext* { - return new paddle::platform::CPUDeviceContext(); + auto* context = new paddle::platform::CPUDeviceContext(); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; }) .def_static("create", [](paddle::platform::XPUPlace& place) @@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); #else - return new paddle::platform::XPUDeviceContext(place); + auto* context = new paddle::platform::XPUDeviceContext(place); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; #endif }) .def_static("create", @@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); #else - return new paddle::platform::CUDADeviceContext(place); + auto* context = new paddle::platform::CUDADeviceContext(place); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context->stream()) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + context->PartialInitWithAllocator(); + return context; #endif }) .def_static("create", diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index e9f222d642ea0..cc9352892030a 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,6 +2,10 @@ add_subdirectory(dynload) add_subdirectory(cpu) +if(WITH_GPU OR WITH_ROCM) + add_subdirectory(gpu) +endif() + if(WITH_XPU) add_subdirectory(xpu) endif() @@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) if(WITH_XPU) add_dependencies(pten_context xpu_context) endif() + +if(WITH_GPU) + add_dependencies(pten_context gpu_context) +endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index efce128596b81..4029c286a5b28 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -15,75 +15,59 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/common/place.h" // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. +#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { -struct CPUContext::CPUImpl { - CPUImpl() { device_ = new Eigen::DefaultDevice(); } +struct CPUContext::Impl { + Impl() : place_(CPUPlace()) {} - // Users need to manage external resources. 
- explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) { - device_ = res_.device; - } + explicit Impl(const Place& place) : place_(place) {} - ~CPUImpl() { - if (res_.device == nullptr && device_ != nullptr) { - delete device_; - device_ = nullptr; + ~Impl() { + if (owned_) { + delete eigen_device_; } } - Eigen::DefaultDevice* GetEigenDevice() const { - PD_CHECK(device_ != nullptr, "the eigen_device is nullptr."); - return device_; + void Init() { + owned_ = true; + eigen_device_ = new Eigen::DefaultDevice(); } - void SetEigenDevice(Eigen::DefaultDevice* device) { - if (device == nullptr) { - return; - } - res_.device = device; - device_ = device; + Eigen::DefaultDevice* GetEigenDevice() const { + PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr."); + return eigen_device_; } - Place GetPlace() const { return place_; } - - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; + bool owned_{false}; + Eigen::DefaultDevice* eigen_device_{nullptr}; + Place place_; }; -CPUContext::CPUContext() : DeviceContext() { - cpu_impl_ = std::make_unique(); -} - -CPUContext::CPUContext(const CPUContext& other) : DeviceContext() { - cpu_impl_ = std::make_unique(); - cpu_impl_->SetEigenDevice(other.eigen_device()); -} +CPUContext::CPUContext() + : DeviceContext(), impl_(std::make_unique()) {} -CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { - cpu_impl_ = std::move(other.cpu_impl_); -} +CPUContext::CPUContext(const Place& place) + : DeviceContext(), impl_(std::make_unique(place)) {} CPUContext::~CPUContext() = default; -CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { - cpu_impl_ = std::make_unique(ctx_res); -} +void CPUContext::Init() { impl_->Init(); } Eigen::DefaultDevice* CPUContext::eigen_device() const { - return cpu_impl_->GetEigenDevice(); + return impl_->GetEigenDevice(); } +const Place& CPUContext::GetPlace() const { return impl_->place_; } + void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) { - cpu_impl_->SetEigenDevice(device); + impl_->eigen_device_ = device; } -Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); } - } // namespace pten diff --git a/paddle/pten/backends/cpu/cpu_context.h b/paddle/pten/backends/cpu/cpu_context.h index 059588dc712ea..dca87a786b961 100644 --- a/paddle/pten/backends/cpu/cpu_context.h +++ b/paddle/pten/backends/cpu/cpu_context.h @@ -24,37 +24,29 @@ limitations under the License. */ namespace pten { -struct CPUContextResource { - Eigen::DefaultDevice* device{nullptr}; -}; - class CPUContext : public DeviceContext { public: - // NOTE: DeviceContext hold resources. Used in training scenarios. CPUContext(); - - // NOTE: Share the same underlying resources, please ensure that resources are - // not released. - CPUContext(const CPUContext&); - - CPUContext(CPUContext&&); - - ~CPUContext(); - + explicit CPUContext(const Place&); + virtual ~CPUContext(); Eigen::DefaultDevice* eigen_device() const; - - // TODO(wilber): Whether the interface should be preserved. - Place GetPlace() const override; + const Place& GetPlace() const override; public: - // NOTE: External users manage resources. Used in inference scenarios. - explicit CPUContext(const CPUContextResource& ctx_res); + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + void Init(); + protected: + // NOTE: External users manage resources. 
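// ---- Editor's note (self-contained mock, not Paddle code) ----
// The ownership rule behind the rewritten CPUContext::Impl: Init() creates and
// owns the Eigen device (training path), while the Set... interface only
// borrows an externally managed pointer (inference path), so the destructor
// deletes nothing it did not create. Minimal mock of that rule:
struct Resource {};                 // stands in for Eigen::DefaultDevice

struct Impl {
  ~Impl() {
    if (owned_) delete resource_;   // only release what Init() created
  }
  void Init() {                     // training: the context owns the resource
    owned_ = true;
    resource_ = new Resource();
  }
  void SetResource(Resource* r) {   // inference: resource managed externally
    resource_ = r;                  // owned_ stays false, so no delete in dtor
  }
  bool owned_{false};
  Resource* resource_{nullptr};
};
// ---- end editor's note ----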
Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. void SetEigenDevice(Eigen::DefaultDevice* device); private: - struct CPUImpl; - std::unique_ptr cpu_impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace pten diff --git a/paddle/pten/backends/gpu/CMakeLists.txt b/paddle/pten/backends/gpu/CMakeLists.txt new file mode 100644 index 0000000000000..09591f79ae8fc --- /dev/null +++ b/paddle/pten/backends/gpu/CMakeLists.txt @@ -0,0 +1,9 @@ +if(WITH_GPU) + add_subdirectory(cuda) + nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda) +elseif(WITH_ROCM) + add_subdirectory(rocm) + hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda) +endif() + +cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3) diff --git a/paddle/pten/backends/gpu/cuda/CMakeLists.txt b/paddle/pten/backends/gpu/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..7eb1983a793bc --- /dev/null +++ b/paddle/pten/backends/gpu/cuda/CMakeLists.txt @@ -0,0 +1 @@ +nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda) diff --git a/paddle/pten/backends/gpu/cuda/cuda_helper.h b/paddle/pten/backends/gpu/cuda/cuda_helper.h new file mode 100644 index 0000000000000..baa1e43ce3c89 --- /dev/null +++ b/paddle/pten/backends/gpu/cuda/cuda_helper.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += blockDim.x * gridDim.x, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc b/paddle/pten/backends/gpu/cuda/cuda_info.cc similarity index 71% rename from paddle/fluid/platform/device/gpu/cuda/cuda_info.cc rename to paddle/pten/backends/gpu/cuda/cuda_info.cc index 6109ed6554318..55766facac815 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc +++ b/paddle/pten/backends/gpu/cuda/cuda_info.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,20 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/pten/backends/gpu/gpu_info.h" + +// TODO(pten): remove fluid headers. #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; -static std::vector g_device_props; +static std::vector g_device_props; + +namespace pten { +namespace backends { +namespace gpu { -namespace paddle { -namespace platform { int DnnVersion() { if (!dynload::HasCUDNN()) return -1; return dynload::cudnnGetVersion(); @@ -75,11 +74,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); @@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; @@ -120,11 +125,13 @@ bool TensorCoreAvailable() { } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); @@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); @@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); @@ -162,32 +173,34 @@ int GetCurrentDeviceId() { return device_id; } -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); - dim3 ret; + id, + GetGPUDeviceCount())); + std::array ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret.x = size; + ret[0] = size; auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret.y = size; + ret[1] = size; auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret.z = size; + ret[2] = size; return ret; } const gpuDeviceProp &GetDeviceProperties(int id) { std::call_once(g_device_props_size_init_flag, [&] { int gpu_num = 0; - gpu_num = platform::GetGPUDeviceCount(); + gpu_num = GetGPUDeviceCount(); g_device_props_init_flags.resize(gpu_num); g_device_props.resize(gpu_num); for (int i = 0; i < gpu_num; ++i) { @@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { }); if (id == -1) { - id = platform::GetCurrentDeviceId(); + id = GetCurrentDeviceId(); } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( + PADDLE_THROW(paddle::platform::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " "appropriate device again!", - id, static_cast(g_device_props.size()), + id, + static_cast(g_device_props.size()), static_cast(g_device_props.size()))); } @@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyPeer(dst, dst_device, src, src_device, count)); } @@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } gpuError_t GpuGetLastError() { return cudaGetLastError(); } -} // namespace platform -} // namespace paddle + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/gpu/forwards.h b/paddle/pten/backends/gpu/forwards.h new file mode 100644 index 0000000000000..d0787159e1e30 --- /dev/null +++ b/paddle/pten/backends/gpu/forwards.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Forward-declares CUDA API types used in platform-agnostic wrapper headers. +#pragma once + +/// Forward declaration of Eigen types. +namespace Eigen { +struct GpuDevice; +} // namespace Eigen + +/// Forward declaration of CUDA types. + +// Forward declaration of CUDA runtime types. +using cudaStream_t = struct CUstream_st *; +using cudaEvent_t = struct CUevent_st *; + +// Forward declaration of cuDNN types. 
+using cudnnHandle_t = struct cudnnContext *; +using cudnnTensorDescriptor_t = struct cudnnTensorStruct *; +using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *; +using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *; +using cudnnFilterDescriptor_t = struct cudnnFilterStruct *; +using cudnnLRNDescriptor_t = struct cudnnLRNStruct *; +using cudnnActivationDescriptor_t = struct cudnnActivationStruct *; +using cudnnSpatialTransformerDescriptor_t = + struct cudnnSpatialTransformerStruct *; +using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *; +using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *; +using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *; +using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *; +using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *; +using cudnnRNNDescriptor_t = struct cudnnRNNStruct *; +using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *; +using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *; +using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *; +using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *; +using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *; +using cudnnAttnDescriptor_t = struct cudnnAttnStruct *; +using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *; +using cudnnFusedOpsVariantParamPack_t = + struct cudnnFusedOpsVariantParamStruct *; +using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *; + +// Forward declaration of cuBLAS types. +using cublasHandle_t = struct cublasContext *; + +// Forward declaration of cuSOLVER types. +using cusolverDnHandle_t = struct cusolverDnContext *; + +// Forward declaration of cuSparse types. +using cusparseHandle_t = struct cusparseContext *; + +// Forward declaration of cuFFT types. +using cufftHandle = int; + +// Forward declaration of NCCL types. +using ncclComm_t = struct ncclComm *; + +/// Forward declaration of ROCM types. +#include + +using hipDevice_t = int; +using hipCtx_t = struct ihipCtx_t *; +using hipModule_t = struct ihipModule_t *; +using hipStream_t = struct ihipStream_t *; +using hipEvent_t = struct ihipEvent_t *; +using hipFunction_t = struct ihipModuleSymbol_t *; + +// Forward declaration of MIOpen types. +using miopenHandle_t = struct miopenHandle *; +using miopenAcceleratorQueue_t = hipStream_t; +using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *; +using miopenTensorDescriptor_t = struct miopenTensorDescriptor *; +using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *; +using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *; +using miopenLRNDescriptor_t = struct miopenLRNDescriptor *; +using miopenActivationDescriptor_t = struct miopenActivationDescriptor *; +using miopenRNNDescriptor_t = struct miopenRNNDescriptor *; +using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *; +using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *; +using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *; +using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *; +using miopenOperatorArgs_t = struct miopenOperatorArgs *; +using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes); +// using miopenDeallocatorFunction = void *(*)(void *context, void *memory); +// struct miopenConvAlgoPerf_t; +// struct miopenConvSolution_t; + +// Forward declaration of rocBLAS types. 
+using rocblas_handle = struct _rocblas_handle *; + +// Forward declaration of hipfft types. +using hipfftHandle = struct hipfftHandle_t *; + +// Forward declaration of rocSOLVER types. +using rocsolver_handle = rocblas_handle; + +// Forward declaration of rocSparse types. +using rocsparse_handle = struct _rocsparse_handle *; diff --git a/paddle/pten/backends/gpu/gpu_context.cc b/paddle/pten/backends/gpu/gpu_context.cc new file mode 100644 index 0000000000000..1e707c46cc93d --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_context.cc @@ -0,0 +1,899 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/pten/backends/gpu/gpu_context.h" +#include +#include +#include +#include +#include + +#include "paddle/pten/api/ext/exception.h" + +#include "paddle/pten/backends/gpu/gpu_decls.h" +#include "paddle/pten/backends/gpu/gpu_info.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/allocator.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#include "paddle/pten/backends/dynload/cusolver.h" +#include "paddle/pten/backends/dynload/cusparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/pten/backends/dynload/nccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#include "paddle/pten/backends/dynload/rccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#endif // PADDLE_WITH_HIP + +// NOTE: The paddle framework should add WITH_EIGEN option to support compile +// without eigen. +#include "unsupported/Eigen/CXX11/Tensor" + +// TODO(pten): remove fluid header. 
+#include "paddle/fluid/platform/enforce.h" + +namespace pten { + +namespace internal { + +class EigenGpuStreamDevice : public Eigen::StreamInterface { + public: + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenGpuStreamDevice() override {} + + void Reinitialize(gpuStream_t cuda_stream, + Allocator* allocator, + GPUPlace place) { + stream_ = cuda_stream; + place_ = place; + allocator_ = allocator; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const gpuStream_t& stream() const override { return stream_; } + + const gpuDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } + auto buf = allocator_->Allocate(num_bytes); + VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested " + << num_bytes; + void* retv = buf->ptr(); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } + return retv; + } + + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#endif + } + return semaphore_; + } + + private: + GPUPlace place_; + gpuStream_t stream_; // not owned; + Allocator* allocator_; // not owned; + const gpuDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ + mutable std::unordered_map allocations_; +}; + +#ifdef PADDLE_WITH_HIP +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void* user_data) +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + static void CUDART_CB StreamCallbackFunc(void* user_data) +#else + static void CUDART_CB + StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) +#endif +#endif +{ + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); +} + +} // namespace internal + +class DnnWorkspaceHandle { + public: + explicit inline DnnWorkspaceHandle(Allocator* allocator) + : allocator_(allocator) {} + + inline void RunFunc(const std::function& cudnn_func, + size_t required_workspace_bytes) { + if (required_workspace_bytes > WorkspaceSize()) { + ReallocWorkspace(required_workspace_bytes); + } + VLOG(2) << "Cudnn workspace size at RunFunc: " + << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; + { + std::lock_guard guard(mtx_); + cudnn_func(allocation_ ? allocation_->ptr() : nullptr); + } + } + + /*! \brief Thread which call RunFuncSync() would release gpu memory after + * running the function. 
Currently this function is only used when cudnn + * exhaustive searching and callers have to guarantee that the input function + * is host blocking */ + inline void RunFuncSync(const std::function& cudnn_func, + size_t required_workspace_bytes) { + RunFunc(cudnn_func, required_workspace_bytes); + ResetWorkspace(); + } + + inline size_t WorkspaceSize() { + if (allocation_ == nullptr) { + return 0; + } + return allocation_->size(); + } + + void ResetWorkspace() { allocation_ = nullptr; } + + void ReallocWorkspace(size_t required_workspace_bytes) { + if (required_workspace_bytes <= WorkspaceSize()) return; + // reset allocation first before re-allocate to save memory + allocation_.reset(); + allocation_ = allocator_->Allocate(required_workspace_bytes); + } + + private: + Allocator::AllocationPtr allocation_{nullptr}; + Allocator* allocator_{nullptr}; + std::mutex mtx_; +}; + +struct GPUContext::Impl { + void Init() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitGpuProperties(); + InitStream(); + InitEigenDevice(); + InitBlasHandle(); + InitDNNHandle(); + InitSolverHandle(); + InitSparseHandle(); + InitDnnWorkspace(); + } + + void PartialInitWithoutAllocator() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitGpuProperties(); + InitStream(); + InitBlasHandle(); + InitDNNHandle(); + InitSolverHandle(); + InitSparseHandle(); + } + + void PartialInitWithAllocator() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitEigenDevice(); + InitDnnWorkspace(); + } + + Impl() : place_(GPUPlace()) {} + + explicit Impl(const GPUPlace& place) : place_(place) {} + + ~Impl() { + backends::gpu::GPUDeviceGuard guard(place_.device); + DestoryInternalWorkspace(); + DestoryInternalEigenDevice(); + DestroyInternalSparseHandle(); + DestroyInternalSolverHandle(); + DestroyInternalDnnHandle(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nccl_comm_) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + } +#endif + DestroyInternalBlasHandle(); + DestoryInternalStream(); + } + + const Place& GetPlace() const { return place_; } + + bool IsTensorCoreAvailable() const { + return blas_tensor_core_handle_ != nullptr; + } + + void InitGpuProperties() { + backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId()); + compute_capability_ = + backends::gpu::GetGPUComputeCapability(place_.GetDeviceId()); + multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId()); + max_threads_per_mp_ = + backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); + max_grid_dim_size_ = + backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId()); + max_threads_per_block_ = + backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); + driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId()); + runtime_version_ = + backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId()); + + // TODO(wilber): glog may be replaced in the future? + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place_.device) + << ", GPU Compute Capability: " << compute_capability_ / 10 << "." + << compute_capability_ % 10 + << ", Driver API Version: " << driver_version_ / 1000 << "." + << (driver_version_ % 100) / 10 + << ", Runtime API Version: " << runtime_version_ / 1000 << "." 
+ << (runtime_version_ % 100) / 10; +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto cudnn_dso_ver = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (cudnn_dso_ver < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place_.device) + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 + << "." << cudnn_dso_ver % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } +#else + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) + << ", cuDNN Version: " << cudnn_dso_ver / 1000 + << "." << (cudnn_dso_ver % 1000) / 100 << "."; + + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = + (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; + auto compile_cuda_version = + (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place_.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } +#endif + } + + void InitDnnWorkspace() { + PD_CHECK(allocator_ != nullptr, + "the device allocator for gpu context is nullptr."); + workspace_ = new DnnWorkspaceHandle(allocator_); + } + + void DestoryInternalWorkspace() { + if (owned_ && workspace_ != nullptr) { + delete workspace_; + stream_ = nullptr; + } + } + + DnnWorkspaceHandle* GetDnnWorkspace() { + PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr."); + return workspace_; + } + + void InitStream() { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0)); +#endif + } + + void DestoryInternalStream() { + if (owned_ && stream_ != nullptr) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + } + stream_ = nullptr; + } + + void SetStream(gpuStream_t stream) { stream_ = stream; } + + gpuStream_t GetStream() const { + PD_CHECK(stream_ != nullptr, "the gpu stream is nullptr."); + return stream_; + } + + void InitEigenDevice() { + PD_CHECK(allocator_ != nullptr, + "the allocator for eigen device is nullptr."); + eigen_stream_.reset(new internal::EigenGpuStreamDevice()); + eigen_stream_->Reinitialize(stream_, allocator_, place_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_.get()); + } + + void DestoryInternalEigenDevice() { + if (owned_ && eigen_device_ != nullptr) { + delete eigen_device_; + eigen_device_ = nullptr; + } + } + + void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; } + + Eigen::GpuDevice* eigen_device() const { + 
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr."); + return eigen_device_; + } + + void InitBlasHandle() { +#ifdef PADDLE_WITH_HIP + pten::dynload::rocblas_create_handle(&blas_handle_); + pten::dynload::rocblas_set_stream(blas_handle_, stream_); +#else // PADDLE_WITH_CUDA + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasCreate(&blas_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_handle_, stream_)); +#if CUDA_VERSION >= 9000 + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasCreate(&blas_tensor_core_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_tensor_core_handle_, stream_)); + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#if CUDA_VERSION >= 11000 + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasCreate(&blas_tf32_tensor_core_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_)); + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif // CUDA_VERSION >= 11000 +#endif // CUDA_VERSION >= 9000 +#endif // PADDLE_WITH_HIP + } + + void DestroyInternalBlasHandle() { +#ifdef PADDLE_WITH_HIP + if (owned_ && blas_handle_ != nullptr) { + pten::dynload::rocblas_destroy_handle(blas_handle_); + blas_handle_ = nullptr; + } +#else + if (owned_ && blas_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_handle_); + blas_handle_ = nullptr; + } + if (owned_ && blas_tensor_core_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_tensor_core_handle_); + blas_tensor_core_handle_ = nullptr; + } + if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_tf32_tensor_core_handle_); + blas_tf32_tensor_core_handle_ = nullptr; + } +#endif // PADDLE_WITH_HIP + } + + blasHandle_t GetBlasHandle() const { + PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr."); + return blas_handle_; + } + + void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } + + void InitDNNHandle() { + if (pten::dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." + << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 100 << "." << local_miopen_version % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenSetStream(dnn_handle_, stream_)); +#else + auto local_cudnn_version = pten::dynload::cudnnGetVersion() / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." 
<< compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cudnnCreate(&dnn_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cudnnSetStream(dnn_handle_, stream_)); +#endif + } else { + dnn_handle_ = nullptr; + } + } + + dnnHandle_t GetDnnHandle() { + PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); + return dnn_handle_; + } + + void DestroyInternalDnnHandle() { +#ifdef PADDLE_WITH_HIP + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::miopenDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } +#else + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::cudnnDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } +#endif // PADDLE_WITH_HIP + } + + void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } + + void InitSolverHandle() { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cusolverDnCreate(&solver_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cusolverDnSetStream(solver_handle_, stream_)); +#endif + } + + void DestroyInternalSolverHandle() { +#ifndef PADDLE_WITH_HIP + if (owned_ && solver_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + pten::dynload::cusolverDnDestroy(solver_handle_)); + solver_handle_ = nullptr; + } +#endif + } + + solverHandle_t GetSolverHandle() const { + PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); + return solver_handle_; + } + + void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } + + void InitSparseHandle() { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + dynload::cusparseSetStream(sparse_handle_, stream_)); +#endif +#endif + } + + void DestroyInternalSparseHandle() { +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10010 + if (owned_ && sparse_handle_ != nullptr) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_)); + sparse_handle_ = nullptr; + } +#endif +#endif + } + + sparseHandle_t GetSparseHandle() const { + PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr."); + return sparse_handle_; + } + + void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; } + + void Wait() const { +#ifdef PADDLE_WITH_HIP + hipError_t e_sync = hipSuccess; +#if !defined(_WIN32) + e_sync = hipStreamSynchronize(stream_); +#else + while (e_sync = hipStreamQuery(stream_)) { + if (e_sync == hipErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) +#else // PADDLE_WITH_HIP + cudaError_t e_sync = cudaSuccess; +#if !defined(_WIN32) + e_sync = cudaStreamSynchronize(stream_); +#else + while (e_sync = cudaStreamQuery(stream_)) { + if (e_sync == cudaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) +#endif // PADDLE_WITH_HIP + + PADDLE_ENFORCE_GPU_SUCCESS(e_sync); + } + + void WaitEvent(gpuEvent_t ev) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); +#endif + } + + ncclComm_t GetNcclComm() const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // 
PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr."); + return nccl_comm_; +#endif + return nullptr; + } + + void SetNcclComm(ncclComm_t comm) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + nccl_comm_ = comm; +#endif + } + + inline void CublasCall( + const std::function& callback) const { + if (blas_tf32_tensor_core_handle_ != nullptr) { + std::lock_guard guard(blas_tf32_mtx_); + callback(blas_tf32_tensor_core_handle_); + } else { + std::lock_guard guard(blas_mtx_); + callback(blas_handle_); + } + } + + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + if (blas_tensor_core_handle_ != nullptr) { + std::lock_guard guard(blas_tensor_core_mtx_); + callback(blas_tensor_core_handle_); + } else { + std::lock_guard guard(blas_mtx_); + callback(blas_handle_); + } + } + + inline void CusparseCall( + const std::function& callback) const { + std::lock_guard guard(sparse_mtx_); + callback(sparse_handle_); + } + + void RecordEvent(gpuEvent_t ev, const std::function& callback) const { + callback(); + RecordEvent(ev); + } + + void RecordEvent(gpuEvent_t ev) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); +#endif + } + + void AddStreamCallback(const std::function& callback) const { + // TODO(wilber): Do we need ThreadPool? + auto* func = new std::function([this, callback] { + std::lock_guard lock(stream_call_back_mtx_); + last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + }); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0)); +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE_GPU_SUCCESS( + cudaLaunchHostFunc(stream_, internal::StreamCallbackFunc, func)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0)); +#endif +#endif + } + + void WaitStreamCallback() const { +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) + pten::backends::gpu::GpuStreamSync(stream_); +#endif + { + std::lock_guard lock(stream_call_back_mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } + } + + bool owned_{false}; + Place place_; + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; + int max_threads_per_block_; + std::array max_grid_dim_size_; + + gpuStream_t stream_{nullptr}; + Eigen::GpuDevice* eigen_device_{nullptr}; + blasHandle_t blas_handle_{nullptr}; + blasHandle_t blas_tensor_core_handle_{nullptr}; + blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + dnnHandle_t dnn_handle_{nullptr}; + solverHandle_t solver_handle_{nullptr}; + sparseHandle_t sparse_handle_{nullptr}; + DnnWorkspaceHandle* workspace_{nullptr}; + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // NCCL communicator (single process version) for NCCL collective operations. + // NCCL collective operations provides fast collectives over multiple GPUs + // both within and across nodes. + // But, this collectives is used for collectives over multiple GPUs within + // nodes. + + // NOTE: Distributed communicator, distributed framework manages its + // resources. 
+ ncclComm_t nccl_comm_{nullptr}; +#endif + + mutable std::mutex blas_mtx_; + mutable std::mutex blas_tensor_core_mtx_; + mutable std::mutex blas_tf32_mtx_; + mutable std::mutex sparse_mtx_; + mutable std::mutex stream_call_back_mtx_; + mutable std::future last_future_; + + Allocator* allocator_{nullptr}; // external resource. + // A internal resouce to initinalize eigen_device. + std::unique_ptr eigen_stream_{nullptr}; +}; + +GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} + +GPUContext::GPUContext(const GPUPlace& place) + : DeviceContext(), impl_(std::make_unique(place)) {} + +GPUContext::~GPUContext() = default; + +const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); } + +gpuStream_t GPUContext::stream() const { return impl_->GetStream(); } + +dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); } + +blasHandle_t GPUContext::cublas_handle() const { + return impl_->GetBlasHandle(); +} + +solverHandle_t GPUContext::cusolver_dn_handle() const { + return impl_->GetSolverHandle(); +} + +sparseHandle_t GPUContext::cusparse_handle() const { + return impl_->GetSparseHandle(); +} + +void GPUContext::Wait() const { impl_->Wait(); } + +void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); } + +bool GPUContext::tensor_core_available() const { + return impl_->IsTensorCoreAvailable(); +} + +int GPUContext::GetComputeCapability() const { + return impl_->compute_capability_; +} + +int GPUContext::GetMaxPhysicalThreadCount() const { + return impl_->multi_process_ * impl_->max_threads_per_mp_; +} + +int GPUContext::GetSMCount() const { return impl_->multi_process_; } + +int GPUContext::GetMaxThreadsPerBlock() const { + return impl_->max_threads_per_block_; +} + +std::array GPUContext::GetCUDAMaxGridDimSize() const { + return impl_->max_grid_dim_size_; +} + +Eigen::GpuDevice* GPUContext::eigen_device() const { + return impl_->eigen_device(); +} + +DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() { + return impl_->GetDnnWorkspace(); +} + +void GPUContext::CublasCall( + const std::function& callback) const { + impl_->CublasCall(callback); +} + +void GPUContext::TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + impl_->TensorCoreCublasCallIfAvailable(callback); +} + +void GPUContext::CusparseCall( + const std::function& callback) const { + impl_->CusparseCall(callback); +} + +void GPUContext::RecordEvent(gpuEvent_t ev, + const std::function& callback) const { + impl_->RecordEvent(ev, callback); +} + +void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); } + +void GPUContext::AddStreamCallback( + const std::function& callback) const { + impl_->AddStreamCallback(callback); +} + +void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); } + +ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } + +void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); } + +void GPUContext::Init() { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->Init(); +} + +void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); } + +void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { + impl_->SetEigenDevice(device); +} + +void GPUContext::SetBlasHandle(blasHandle_t blas) { + impl_->SetBlasHandle(blas); +} + +void GPUContext::SetDnnHandle(dnnHandle_t handle) { + impl_->SetDnnHandle(handle); +} + +void GPUContext::SetSolverHandle(solverHandle_t handle) { + impl_->SetSolverHandle(handle); +} + +void 
GPUContext::SetSparseHandle(sparseHandle_t handle) { + impl_->SetSparseHandle(handle); +} + +void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) { + impl_->workspace_ = handle; +} + +void GPUContext::PartialInitWithoutAllocator() { + impl_->PartialInitWithoutAllocator(); +} + +void GPUContext::PartialInitWithAllocator() { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->PartialInitWithAllocator(); +} + +void GPUContext::SetComputeCapability(int val) { + impl_->compute_capability_ = val; +} + +void GPUContext::SetMaxThreadsPerMultiProcessor(int val) { + impl_->max_threads_per_mp_ = val; +} + +void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; } + +void GPUContext::SetMaxThreadsPerBlock(int val) { + impl_->max_threads_per_block_ = val; +} + +void GPUContext::SetMaxGridDimSize(const std::array& val) { + impl_->max_grid_dim_size_ = val; +} + +void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } + +void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_context.h b/paddle/pten/backends/gpu/gpu_context.h index 1adfd155ce334..2a2be0e44b4f0 100644 --- a/paddle/pten/backends/gpu/gpu_context.h +++ b/paddle/pten/backends/gpu/gpu_context.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,13 +14,162 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" +#include +#include +#include "paddle/pten/backends/gpu/forwards.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" +#include "paddle/pten/backends/gpu/gpu_helper.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/device_context.h" namespace pten { -using GPUContext = paddle::platform::CUDADeviceContext; -} // namespace pten -#endif +class DnnWorkspaceHandle; + +class GPUContext : public DeviceContext { + public: + GPUContext(); + + explicit GPUContext(const GPUPlace& place); + + virtual ~GPUContext(); + + /*! \brief Return place in the device context. */ + const Place& GetPlace() const override; + + /*! \brief Return gpu stream in the device context. */ + gpuStream_t stream() const; + + /*! \brief Return cudnn handle in the device context. */ + dnnHandle_t cudnn_handle() const; + + /*! \brief Return cublas handle in the device context. */ + blasHandle_t cublas_handle() const; + + /*! \brief Return cusolver handle in the device context. */ + solverHandle_t cusolver_dn_handle() const; + + /*! \brief Return cusparse handle in the device context. */ + sparseHandle_t cusparse_handle() const; + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Wait for event in the stream. */ + void WaitEvent(gpuEvent_t ev) const; + + /*! \brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + + /*! \brief Return the max physical thread count in the device context */ + int GetMaxPhysicalThreadCount() const; + + /*! \brief Return the SM count in the device context */ + int GetSMCount() const; + + /*! 
\brief Return the Max thread num of block in the device context */ + int GetMaxThreadsPerBlock() const; + + /*! \brief Return the max grid dim size in the device context */ + std::array GetCUDAMaxGridDimSize() const; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return a cudnn workspace handle to call multiple cudnn + * functions without interrupting by other threads. + * Once the first cudnn function is called by the handle, a lock + * would be acquired to prevent other threads from accessing the + * workspace. Once the handle is destructed, the lock would be released. + */ + DnnWorkspaceHandle* cudnn_workspace_handle(); + + public: + /*! \brief Call cublas function safely. */ + void CublasCall(const std::function&) const; + + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + void TensorCoreCublasCallIfAvailable( + const std::function&) const; + + /*! \brief Call cusparse function safely. */ + void CusparseCall(const std::function&) const; + + void RecordEvent(gpuEvent_t ev, const std::function& callback) const; + + void RecordEvent(gpuEvent_t ev) const; + + void AddStreamCallback(const std::function& callback) const; + + void WaitStreamCallback() const; + + public: + /*! \brief Return nccl communicators. */ + ncclComm_t nccl_comm() const; + + /*! \brief Set nccl communicators. */ + void set_nccl_comm(ncclComm_t comm); + + public: + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + // Note that you must set the Allocator before calling Init function. + void Init(); + + // TODO(wilber): Why does the GetAllocator interface require a stream + // parameter? + // The temporary trick method bypasses this problem, and the following + // interfaces + // need to be deleted later. + + // Note that this is a trick implementation, which can be used to partially + // initialize when the SetAllocator interface is not called. + void PartialInitWithoutAllocator(); + // Note that this is a trick implementation that can be used to initialize + // resources that require an Allocator when the SetAllocator interface is + // called. + void PartialInitWithAllocator(); + + protected: + // NOTE: External users manage resources. Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. + void SetStream(gpuStream_t); + + void SetEigenDevice(Eigen::GpuDevice*); + + void SetBlasHandle(blasHandle_t); + + void SetDnnHandle(dnnHandle_t); + + void SetSolverHandle(solverHandle_t); + + void SetSparseHandle(sparseHandle_t); + + void SetDnnWorkspaceHandle(DnnWorkspaceHandle*); + + void SetComputeCapability(int val); + + void SetMaxThreadsPerMultiProcessor(int val); + + void SetMultiProcessors(int val); + + void SetMaxThreadsPerBlock(int val); + + void SetMaxGridDimSize(const std::array& val); + + void SetDriverVersion(int val); + + void SetRuntimeVersion(int val); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_decls.h b/paddle/pten/backends/gpu/gpu_decls.h new file mode 100644 index 0000000000000..747244c4407e0 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_decls.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/backends/gpu/forwards.h" + +namespace pten { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; + +#else // PADDLE_WITH_CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); + +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); + +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); + +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); + +#undef DECLARE_TYPE_FOR_GPU + +using CUDAGraphID = unsigned long long; // NOLINT + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_helper.h b/paddle/pten/backends/gpu/gpu_helper.h new file mode 100644 index 0000000000000..e9254115ed5bc --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_helper.h @@ -0,0 +1,26 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
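
The DECLARE_TYPE_FOR_GPU aliases above, together with the opaque handles forward-declared in forwards.h, let backend-neutral headers name GPU handle types without pulling in any CUDA or ROCm headers. A minimal sketch of the intended consumption pattern follows; the ExampleHolder struct and its member names are illustrative assumptions, not part of this patch.

    // Sketch only: a header-side consumer of the neutral handle aliases.
    // Depending on whether PADDLE_WITH_HIP is defined, pten::gpuStream_t,
    // pten::blasHandle_t and pten::dnnHandle_t resolve to the CUDA types
    // (cudaStream_t, cublasHandle_t, cudnnHandle_t) or the ROCm ones
    // (hipStream_t, rocblas_handle, miopenHandle_t), so this struct compiles
    // against either stack without including cublas_v2.h or rocblas.h.
    #include "paddle/pten/backends/gpu/gpu_decls.h"

    struct ExampleHolder {  // hypothetical, for illustration
      pten::gpuStream_t stream{nullptr};
      pten::blasHandle_t blas{nullptr};
      pten::dnnHandle_t dnn{nullptr};
    };
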
+ +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/gpu/rocm/rocm_helper.h" +#else +#include "paddle/pten/backends/gpu/cuda/cuda_helper.h" +#endif + +#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) + +#endif diff --git a/paddle/pten/backends/gpu/gpu_info.cc b/paddle/pten/backends/gpu/gpu_info.cc new file mode 100644 index 0000000000000..d1b8d4a362637 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_info.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/gpu/gpu_info.h" + +#include + +#include "gflags/gflags.h" + +DECLARE_string(selected_gpus); + +namespace pten { +namespace backends { +namespace gpu { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_info.h b/paddle/pten/backends/gpu/gpu_info.h new file mode 100644 index 0000000000000..59add6166d1d2 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_info.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#include +#include +#include +#include + +#include "paddle/pten/backends/gpu/gpu_types.h" + +namespace pten { +namespace backends { +namespace gpu { + +//! Get the version of dnn +int DnnVersion(); + +//! Get the total number of GPU devices in system. +int GetGPUDeviceCount(); + +//! Get the compute capability of the ith GPU (format: major * 10 + minor) +int GetGPUComputeCapability(int id); + +//! Get the runtime version of the ith GPU +int GetGPURuntimeVersion(int id); + +//! 
Get the driver version of the ith GPU +int GetGPUDriverVersion(int id); + +//! Wheter the current device support TensorCore +bool TensorCoreAvailable(); + +//! Get the MultiProcessors of the ith GPU. +int GetGPUMultiProcessors(int id); + +//! Get the MaxThreads of each MultiProcessor of the ith GPU. +int GetGPUMaxThreadsPerMultiProcessor(int id); + +//! Get the MaxThreads of each block of the ith GPU. +int GetGPUMaxThreadsPerBlock(int id); + +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Get the maximum GridDim size for GPU buddy allocator. +std::array GetGpuMaxGridDimSize(int); + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices(); + +//! Get the properties of the ith GPU device. +const gpuDeviceProp &GetDeviceProperties(int id); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + +//! Copy memory from address src to dst asynchronously. +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream); + +//! Copy memory from address src to dst synchronously. +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind); + +//! Copy memory from one device to another device asynchronously. +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream); + +//! Copy memory from one device to another device synchronously. +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count); + +//! Set memory dst with value count size asynchronously +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); + +//! Blocks until stream has completed all operations. +void GpuStreamSync(gpuStream_t stream); + +void GpuDestroyStream(gpuStream_t stream); + +// ! Blocks until device has completed all operations. +void GpuDeviceSync(); + +gpuError_t GpuGetLastError(); + +class GPUDeviceGuard { + public: + explicit inline GPUDeviceGuard(int dev_id) { + int prev_id = GetCurrentDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + SetDeviceId(dev_id); + } + } + inline ~GPUDeviceGuard() { + if (prev_id_ != -1) { + SetDeviceId(prev_id_); + } + } + GPUDeviceGuard(const GPUDeviceGuard &o) = delete; + GPUDeviceGuard &operator=(const GPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +} // namespace gpu +} // namespace backends +} // namespace pten + +#endif diff --git a/paddle/pten/backends/gpu/gpu_launch_config.h b/paddle/pten/backends/gpu/gpu_launch_config.h new file mode 100644 index 0000000000000..c166b3d2f80ed --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_launch_config.h @@ -0,0 +1,181 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
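
The wrappers declared in gpu_info.h mirror the fluid platform helpers that this patch moves under paddle/pten/backends. Below is a minimal usage sketch, assuming the caller already owns a stream and supplies the memcpy kind; the CopyAndSync function name is illustrative only, and the pten::gpuMemcpyKind spelling assumes the alias defined in gpu_types.h.

    #include <cstddef>
    #include "paddle/pten/backends/gpu/gpu_info.h"
    #include "paddle/pten/backends/gpu/gpu_types.h"

    // Sketch: copy `bytes` from src to dst while device `dev_id` is active,
    // then block until the copy has finished. GPUDeviceGuard restores the
    // previously active device when it goes out of scope.
    void CopyAndSync(int dev_id, void* dst, const void* src, size_t bytes,
                     pten::gpuMemcpyKind kind, pten::gpuStream_t stream) {
      pten::backends::gpu::GPUDeviceGuard guard(dev_id);
      pten::backends::gpu::GpuMemcpyAsync(dst, src, bytes, kind, stream);
      pten::backends::gpu::GpuStreamSync(stream);
    }
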
+ +// Used for compute gpu launch parameter config + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_CUDA +#include +#else +#include +#endif + +#include +#include +#include +#include +#include "paddle/pten/backends/gpu/gpu_context.h" + +#ifdef __HIPCC__ +// HIP results in error or nan if > 256 +#define PREDEFINED_BLOCK_SIZE 256 +#else +/* CUDA performs better as thread_per_block + num is between [64, 512] */ +#define PREDEFINED_BLOCK_SIZE 512 +#endif + +namespace pten { +namespace backends { +namespace gpu { + +inline int DivUp(int a, int b) { return (a + b - 1) / b; } + +/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + for round integer value into next highest power of 2. */ +static inline int RoundToPowerOfTwo(int n) { + n--; + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); +#ifdef __HIPCC__ + return std::min(256, std::max(32, (n + 1))); +#else + return std::min(1024, std::max(32, (n + 1))); +#endif +} + +#ifdef WITH_NV_JETSON +// The number of threads cannot be assigned 1024 in some cases when the device +// is nano or tx2 . +inline void ChangeThreadNum(const pten::CUDAContext& context, + int* num_thread, + int alternative_num_thread = 512) { + if (context.GetComputeCapability() == 53 || + context.GetComputeCapability() == 62) { + *num_thread = alternative_num_thread; + } +} +#endif + +struct GpuLaunchConfig { + public: + GpuLaunchConfig() {} + + size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); } + + size_t GetGridSize() const { + return block_per_grid.x * block_per_grid.y * block_per_grid.z; + } + + size_t GetBlockSize() const { + return thread_per_block.x * thread_per_block.y * thread_per_block.z; + } + + int compute_capability = 0; + dim3 thread_per_block = dim3(1, 1, 1); + dim3 block_per_grid = dim3(1, 1, 1); +}; + +/* According to NVIDIA, if number of threads per block is 64/128/256/512, + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. */ +inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context, + int64_t numel, + int vec_size = 1) { + PADDLE_ENFORCE_GT(numel, + 0, + paddle::platform::errors::InvalidArgument( + "element quantity should be greater than 0," + " but received value is: %d.", + numel)); + // Get compute_capability + const int capability = context.GetComputeCapability(); + /* If thread number per block is 64/128/256/512, cuda performs better.*/ + int limit_threads = + std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock()); +#ifdef WITH_NV_JETSON + if (capability == 53 || capability == 62) { + limit_threads = 512; + } +#endif + int threads = limit_threads; + int sm_count = context.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. 
+ threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + threads = std::max(64, threads); + int blocks = DivUp(DivUp(numel, vec_size), threads); + + GpuLaunchConfig config; + config.thread_per_block.x = threads; + config.block_per_grid.x = blocks; + config.compute_capability = capability; + return config; +} + +inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context, + int x_dim, + int y_dim) { + PADDLE_ENFORCE_GT(x_dim, + 0, + paddle::platform::errors::InvalidArgument( + "x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT(y_dim, + 0, + paddle::platform::errors::InvalidArgument( + "y dim number should greater than 0," + " but received value is: %d", + y_dim)); + + const int kThreadsPerBlock = 256; + int block_cols = (std::min)(x_dim, kThreadsPerBlock); + int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1); + + int max_physical_threads = context.GetMaxPhysicalThreadCount(); + const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1); + + GpuLaunchConfig config; + // Noticed, block size is not align to 32, if needed do it yourself. + config.thread_per_block = dim3(block_cols, block_rows, 1); + + int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks); + int grid_y = + (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1)); + + config.block_per_grid = dim3(grid_x, grid_y, 1); + return config; +} + +} // namespace gpu +} // namespace backends +} // namespace pten + +#endif diff --git a/paddle/pten/backends/gpu/gpu_types.h b/paddle/pten/backends/gpu/gpu_types.h new file mode 100644 index 0000000000000..72353294b29e0 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_types.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
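To see what the 1-D heuristic above actually produces, the following host-only sketch reproduces its arithmetic for an assumed device with 80 SMs and a 512-thread block limit (the CUDA branch of RoundToPowerOfTwo). The device numbers and element count are illustrative assumptions, not values from the patch:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static int DivUp(int a, int b) { return (a + b - 1) / b; }

static int RoundToPowerOfTwo(int n) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));  // CUDA branch; the HIP branch caps at 256
}

int main() {
  int64_t numel = 10000;   // assumed element count
  int vec_size = 4;        // assumed vectorization width
  int sm_count = 80;       // assumed SM count
  int limit_threads = 512; // PREDEFINED_BLOCK_SIZE on CUDA
  int active_threads = static_cast<int>(numel / vec_size);  // 2500
  int threads = limit_threads;
  if (active_threads / (sm_count << 1) < limit_threads) {
    // aim for roughly two active blocks per SM
    threads = RoundToPowerOfTwo(active_threads / (sm_count << 1));
  } else if (active_threads / (sm_count << 2) < limit_threads) {
    // otherwise roughly four active blocks per SM
    threads = RoundToPowerOfTwo(active_threads / (sm_count << 2));
  }
  threads = std::max(64, threads);  // never fewer than 64 threads per block
  int blocks = DivUp(DivUp(static_cast<int>(numel), vec_size), threads);
  std::printf("threads=%d blocks=%d\n", threads, blocks);  // prints: threads=64 blocks=40
  return 0;
}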
+ +#pragma once + +#include "paddle/pten/backends/gpu/forwards.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" +#else // PADDLE_WITH_CUDA +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#endif + +namespace pten { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; + +#else // PADDLE_WITH_CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, + cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, + cudnnActivationMode_t, + miopenActivationMode_t); + +#undef DECLARE_TYPE_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = ROCM_CV; +#else // PADDLE_WITH_CUDA +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = CUDA_CV; +#endif + +DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, + cudaErrorMemoryAllocation, + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + +#undef DECLARE_CONSTANT_FOR_GPU +} // namespace pten + +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/pten/backends/gpu/rocm/CMakeLists.txt b/paddle/pten/backends/gpu/rocm/CMakeLists.txt new file mode 100644 index 0000000000000..181f92cbfc31c --- /dev/null +++ b/paddle/pten/backends/gpu/rocm/CMakeLists.txt @@ -0,0 +1 @@ +hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda) diff --git a/paddle/pten/backends/gpu/rocm/rocm_helper.h b/paddle/pten/backends/gpu/rocm/rocm_helper.h new file mode 100644 index 0000000000000..3a16bfc5286ae --- /dev/null +++ b/paddle/pten/backends/gpu/rocm/rocm_helper.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. 
The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB; the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will exceed INT_MAX and overflow to a negative value. At that point + * the loop condition `i < (n)` is still satisfied, so the kernel makes + * an illegal access to CUDA memory. + * + * Here is a real example from ERNIE that triggers the above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we rewrite the for condition as follows; the int64_t __index__ + * prevents overflow in the loop increment. + * + * Parameters: + * - i: loop index + * - num: total number of elements + * + * Examples: + * template <typename T> + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc b/paddle/pten/backends/gpu/rocm/rocm_info.cc similarity index 72% rename from paddle/fluid/platform/device/gpu/rocm/rocm_info.cc rename to paddle/pten/backends/gpu/rocm/rocm_info.cc index 06dba8ce423ef..095acdf0769d4 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc +++ b/paddle/pten/backends/gpu/rocm/rocm_info.cc @@ -12,20 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include +#include "paddle/pten/backends/gpu/gpu_info.h" + +// TODO(pten): remove fluid headers. #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; static std::vector g_device_props; -namespace paddle { -namespace platform { +namespace pten { +namespace backends { +namespace gpu { + int DnnVersion() { if (!dynload::HasCUDNN()) return -1; size_t version_major, version_minor, version_patch; @@ -78,11 +78,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = hipDeviceGetAttribute( &major, hipDeviceAttributeComputeCapabilityMajor, id); @@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); return driver_version; @@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) { bool TensorCoreAvailable() { return false; } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); @@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); @@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); @@ -161,35 +173,37 @@ int GetCurrentDeviceId() { return device_id; } -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); - dim3 ret; + id, + GetGPUDeviceCount())); + std::array ret; int size; auto error_code_x = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret.x = size; + ret[0] = size; auto error_code_y = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret.y = size; + ret[1] = size; auto error_code_z = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret.z = size; + ret[2] = size; return ret; } const gpuDeviceProp &GetDeviceProperties(int id) { std::call_once(g_device_props_size_init_flag, [&] { int gpu_num = 0; - gpu_num = platform::GetGPUDeviceCount(); + gpu_num = GetGPUDeviceCount(); g_device_props_init_flags.resize(gpu_num); g_device_props.resize(gpu_num); for (int i = 0; i < gpu_num; ++i) { @@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { }); if (id == -1) { - id = platform::GetCurrentDeviceId(); + id = GetCurrentDeviceId(); } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( + PADDLE_THROW(paddle::platform::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " "appropriate device again!", - id, static_cast(g_device_props.size()), + id, + static_cast(g_device_props.size()), static_cast(g_device_props.size()))); } @@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyPeer(dst, dst_device, src, src_device, count)); } @@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } gpuError_t GpuGetLastError() { return hipGetLastError(); } -} // namespace platform -} // namespace paddle + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc index af4478662a53b..22e82b220d2a8 100644 --- a/paddle/pten/backends/xpu/xpu_context.cc +++ b/paddle/pten/backends/xpu/xpu_context.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/pten/backends/xpu/xpu_context.h" + #include + #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/common/place.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" @@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api; namespace pten { -struct XPUContext::XPUImpl { - void SetL3Cache() { +struct XPUContext::Impl { + void SetL3Cache(int l3_size = 14155776) { const int MAX_XPU_NUM = 16; static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - int l3_size = 13.5 * 1024 * 1024; if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); } @@ -52,48 +54,28 @@ struct XPUContext::XPUImpl { } } - XPUImpl() { - context_ = xpu::create_context(); - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - } - - explicit XPUImpl(XPUPlace place) : place_(place) { - backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - - LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - SetL3Cache(); - } + Impl() : place_(XPUPlace()) {} - // Users need to manage external resources. 
- explicit XPUImpl(const XPUContextResource& ctx_res, - const XPUPlace& place = XPUPlace(0)) - : res_(ctx_res), place_(place) { - context_ = res_.context; - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - SetL3Cache(); - } + explicit Impl(const Place& place) : place_(place) {} - ~XPUImpl() { - if (res_.context == nullptr && context_ != nullptr) { + ~Impl() { + if (owned_ && context_ != nullptr) { xpu::destroy_context(context_); context_ = nullptr; } } - Place GetPlace() const { return place_; } - - backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; } + const Place& GetPlace() const { return place_; } xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); return context_; } - xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; } + xpu::BKCLContext_t GetBkclContext() const { + PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr."); + return bkcl_context_; + } void Wait() const { backends::xpu::SetXPUDeviceId(place_.GetDeviceId()); @@ -101,53 +83,41 @@ struct XPUContext::XPUImpl { xpu_wait(context_->xpu_stream); } - void SetXContext(xpu::Context* context) { - if (context == nullptr) { - return; - } - res_.context = context; - context_ = context; + void Init() { + owned_ = true; + backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); + LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " + << static_cast(place_.device); + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); } + void SetXContext(xpu::Context* context) { context_ = context; } + void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } - XPUContextResource res_; - XPUPlace place_; + bool owned_{false}; + Place place_; backends::xpu::XPUVersion xpu_version_; xpu::Context* context_{nullptr}; + // NOTE: Distributed communicator, distributed framework manages its // resources, XPUContext only holds references. 
xpu::BKCLContext_t bkcl_context_{nullptr}; }; -XPUContext::XPUContext() : DeviceContext() { - impl_ = std::make_unique(); -} +XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique()) {} -XPUContext::XPUContext(const XPUPlace& place) { - impl_ = std::make_unique(place); -} - -XPUContext::XPUContext(const XPUContext& other) : DeviceContext() { - impl_ = std::make_unique(); - impl_->SetXContext(other.x_context()); - impl_->SetBkclContext(other.bkcl_context()); -} - -XPUContext::XPUContext(XPUContext&& other) : DeviceContext() { - impl_ = std::move(other.impl_); -} +XPUContext::XPUContext(const XPUPlace& place) + : DeviceContext(), impl_(std::make_unique(place)) {} XPUContext::~XPUContext() = default; -XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() { - impl_ = std::make_unique(ctx_res); -} - -Place XPUContext::GetPlace() const { return impl_->GetPlace(); } +const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); } backends::xpu::XPUVersion XPUContext::xpu_version() const { - return impl_->GetXpuVersion(); + return impl_->xpu_version_; } xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); } @@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const { void XPUContext::Wait() const { impl_->Wait(); } -void XPUContext::set_x_context(xpu::Context* context) { +void XPUContext::SetXContext(xpu::Context* context) { impl_->SetXContext(context); } -void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) { +void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); } + +void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { impl_->SetBkclContext(context); } +void XPUContext::Init() { impl_->Init(); } + } // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h index 4ae5786211dd2..440d06a60fc5c 100644 --- a/paddle/pten/backends/xpu/xpu_context.h +++ b/paddle/pten/backends/xpu/xpu_context.h @@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api; namespace pten { -struct XPUContextResource { - xpu::Context* context{nullptr}; -}; - class XPUContext : public DeviceContext { public: - // NOTE: DeviceContext hold resources. Used in training scenarios. XPUContext(); explicit XPUContext(const XPUPlace&); - // NOTE: Share the same underlying resources, please ensure that resources are - // not released. - XPUContext(const XPUContext&); - - XPUContext(XPUContext&&); - virtual ~XPUContext(); - Place GetPlace() const override; + const Place& GetPlace() const override; backends::xpu::XPUVersion xpu_version() const; @@ -53,21 +42,28 @@ class XPUContext : public DeviceContext { // Return bkcl context. xpu::BKCLContext_t bkcl_context() const; + void SetBkclContext(xpu::BKCLContext_t context); // Wait for all operations completion in the stream. void Wait() const override; public: - // NOTE: External users manage resources. Used in inference scenarios. - explicit XPUContext(const XPUContextResource&); + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + void Init(); - void set_x_context(xpu::Context*); + public: + // NOTE: External users manage resources. Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. 
+ void SetXContext(xpu::Context*); - void set_bkcl_context(xpu::BKCLContext_t context); + void SetL3Cache(int l3_size = 14155776); private: - struct XPUImpl; - std::unique_ptr impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace pten diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index d6e01c5c6e664..2a11b1bef9dbc 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -23,7 +23,7 @@ struct DeviceContext::Impl { Impl() = default; ~Impl() = default; - void SetDeviceAllocator(const Allocator* allocator) { + void SetAllocator(const Allocator* allocator) { PADDLE_ENFORCE_NOT_NULL( allocator, pten::errors::InvalidArgument( @@ -47,7 +47,7 @@ struct DeviceContext::Impl { zero_allocator_ = allocator; } - const Allocator& GetDeviceAllocator() const { + const Allocator& GetAllocator() const { PADDLE_ENFORCE_NOT_NULL( device_allocator_, pten::errors::InvalidArgument("Required device_allocator_ shall not be " @@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); - impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); + impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); } @@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { - impl_->SetDeviceAllocator(allocator); +void DeviceContext::SetAllocator(const Allocator* allocator) { + impl_->SetAllocator(allocator); } -const Allocator& DeviceContext::GetDeviceAllocator() const { - return impl_->GetDeviceAllocator(); +const Allocator& DeviceContext::GetAllocator() const { + return impl_->GetAllocator(); } void DeviceContext::SetHostAllocator(const Allocator* allocator) { diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 30be5cd22dd4e..68c16dc3a1964 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -60,7 +60,7 @@ class DeviceContext { * * @param allocator */ - void SetDeviceAllocator(const Allocator*); + void SetAllocator(const Allocator*); /** * @brief Set the host Allocator object. @@ -81,7 +81,7 @@ class DeviceContext { * * @return Allocator */ - const Allocator& GetDeviceAllocator() const; + const Allocator& GetAllocator() const; /** * @brief Get the const device-related Allocator object. @@ -114,7 +114,7 @@ class DeviceContext { // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. - virtual Place GetPlace() const = 0; + virtual const Place& GetPlace() const = 0; // TODO(wilber): The fluid framework uses wait() in many places, how to delete // this API interface. virtual void Wait() const {} diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h index 6c7c6d11713c3..483e58eedb4d1 100644 --- a/paddle/pten/kernels/funcs/cuda_kernel_config.h +++ b/paddle/pten/kernels/funcs/cuda_kernel_config.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -31,7 +32,7 @@ namespace funcs { * 2x~4x) than number of SMs. 
Hence, SM count is took into account within * this function to determine the right number of threads per block. */ -inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, +inline int GetThreadsConfig(const pten::GPUContext &ctx, int64_t numel, int vec_size) { int threads = ELEMENTWISE_BLOCK_SIZE; diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 3f1651eeb276f..d102fd637181f 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -23,8 +23,8 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/backends/gpu/gpu_launch_config.h" #include "paddle/pten/kernels/primitive/kernel_primitives.h" namespace kps = pten::kps; @@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx, VecSize><<>>( ins_data, outs_data, numel, main_offset, func); #else - auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize); + auto gpu_config = + pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize * gpu_config.GetBlockSize(); auto stream = ctx.stream(); diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index a7b7184487c96..09baa2c6e023a 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/ddim.h" diff --git a/paddle/pten/kernels/gpu/concat_and_split.h b/paddle/pten/kernels/gpu/concat_and_split.h index 66b21b5f51351..47022666564df 100644 --- a/paddle/pten/kernels/gpu/concat_and_split.h +++ b/paddle/pten/kernels/gpu/concat_and_split.h @@ -237,12 +237,11 @@ __global__ void SplitKernel(const T* input_data, SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); } -static inline void GetBlockDims( - const paddle::platform::CUDADeviceContext& context, - int64_t num_rows, - int64_t num_cols, - dim3* block_dims, - dim3* grid_dims) { +static inline void GetBlockDims(const pten::GPUContext& context, + int64_t num_rows, + int64_t num_cols, + dim3* block_dims, + dim3* grid_dims) { // Set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index 6c0e7b1fe65d3..d48a9fb1d774f 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT @@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place)); auto stream = blocking ? 
nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT @@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place.device)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT @@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place.device)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT @@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx, ctx_place)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); if (paddle::platform::is_same_place(src_place, dst_place)) { paddle::memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index f988f5abdb1ca..947f969b077d0 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } // dy @@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); } } } @@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } // dy @@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::InverseFunctor>( - dout, dy, kps::InverseFunctor(), reduce_dims, stream); + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); } } } diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index ca850d2c8c5ee..7a76a988dee25 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -1064,7 +1064,8 @@ template class ReduceOp, typename TransformOp> -void TensorReduceFunctorImpl(const pten::DenseTensor& x, +void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx, + const pten::DenseTensor& x, pten::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, @@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, auto x_data = x.data(); auto y_data = y->data(); - auto* dev_ctx = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(x.place())); if (config.reduce_num == 1) { std::vector inputs = {&x}; std::vector outputs = {y}; funcs::LaunchSameDimsElementwiseCudaKernel( - *dev_ctx, inputs, &outputs, transform); + dev_ctx, inputs, &outputs, transform); return; } @@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx, data_t, 
ReduceOp, TransformOp>( - x, out, TransformOp(reduce_num), reduce_dims, stream); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; pten::kernels:: TensorReduceFunctorImpl>( - x, out, TransformOp(reduce_num), reduce_dims, stream); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); } } } // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 87785a2b4778a..226cfd89b1327 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad { DenseTensor* output, const std::vector& reduce_dims) { auto stream = dev_ctx.stream(); - kernels:: - TensorReduceFunctorImpl>( - input, output, kps::IdentityFunctor(), reduce_dims, stream); + kernels::TensorReduceFunctorImpl>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); } }; #endif diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index 48d2374b68b76..4d3adf86d166d 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/copy_kernel.h" @@ -122,7 +123,7 @@ TEST(API, matmul_cuda) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto place = paddle::platform::CUDAPlace(); - auto* dev_ctx = pool.GetByPlace(place); + auto* dev_ctx = static_cast(pool.GetByPlace(place)); pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); diff --git a/paddle/pten/tests/api/test_sparse_utils_api.cc b/paddle/pten/tests/api/test_sparse_utils_api.cc index 3ab7a60dff19e..1ec025faedc2c 100644 --- a/paddle/pten/tests/api/test_sparse_utils_api.cc +++ b/paddle/pten/tests/api/test_sparse_utils_api.cc @@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) { std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); pten::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); // 1. test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); diff --git a/paddle/pten/tests/core/test_device_context.cc b/paddle/pten/tests/core/test_device_context.cc index c84c4a41a02bf..4fa8faabcfcee 100644 --- a/paddle/pten/tests/core/test_device_context.cc +++ b/paddle/pten/tests/core/test_device_context.cc @@ -25,43 +25,29 @@ limitations under the License. 
*/ namespace pten { namespace tests { +class InferenceCPUContext : public CPUContext { + public: + void SetEigenDevice(Eigen::DefaultDevice* eigen_device) { + CPUContext::SetEigenDevice(eigen_device); + } +}; + TEST(DeviceContext, cpu_context) { std::cout << "test training scenarios" << std::endl; { pten::CPUContext ctx; + ctx.Init(); EXPECT_TRUE(ctx.eigen_device() != nullptr); } std::cout << "test inference scenarios" << std::endl; Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); { - pten::CPUContextResource ctx_res{device}; - pten::CPUContext ctx(ctx_res); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - { - pten::CPUContextResource ctx_res{nullptr}; - pten::CPUContext ctx(ctx_res); + InferenceCPUContext ctx; ctx.SetEigenDevice(device); EXPECT_TRUE(ctx.eigen_device() != nullptr); } delete device; - - std::cout << "test copy constructor" << std::endl; - { - pten::CPUContext ctx1; - pten::CPUContext ctx2(ctx1); - EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device()); - } - - std::cout << "test move constructor" << std::endl; - { - pten::CPUContext ctx1 = pten::CPUContext(); - auto* eigen_device1 = ctx1.eigen_device(); - pten::CPUContext ctx2(std::move(ctx1)); - auto* eigen_device2 = ctx2.eigen_device(); - EXPECT_EQ(eigen_device1, eigen_device2); - } } } // namespace tests diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 2f9dab0fbef04..d6aac6d7db862 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -50,10 +50,10 @@ TEST(DEV_API, cast) { } pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); pten::DataType out_dtype = pten::DataType::FLOAT64; // 2. test API diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc index ad2cf8c220e49..5e78545f6d6d9 100644 --- a/paddle/pten/tests/kernels/test_concat_dev_api.cc +++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc @@ -59,10 +59,10 @@ TEST(DEV_API, concat) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 27332d99be6a1..cac8370f67fbf 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -46,10 +46,10 @@ TEST(DEV_API, conj) { } pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. 
test API auto out = pten::Conj(dev_ctx, dense_x); diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index 76891028c4399..cd589142aed9b 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -58,10 +58,10 @@ TEST(DEV_API, copy) { std::cout << typeid(a).name() << std::endl; // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); // 3. check result diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index 14f323399cc1f..0e87fe554a2f4 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -33,10 +33,10 @@ using DDim = pten::framework::DDim; TEST(DEV_API, empty) { // 1. create input pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::Empty(dev_ctx, {3, 2}, pten::DataType::INT32); @@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -84,10 +84,10 @@ TEST(DEV_API, full) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Full(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32); // 3. check result @@ -118,10 +118,10 @@ TEST(DEV_API, full_like) { float val = 1.0; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 3343ab1d4283a..051f05c7805ca 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -60,10 +60,10 @@ TEST(DEV_API, dot) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index 5a44fb6dae870..20dd5ddb4a38c 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -62,10 +62,10 @@ TEST(DEV_API, add) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -116,10 +116,10 @@ TEST(DEV_API, subtract) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -170,10 +170,10 @@ TEST(DEV_API, divide) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y); // 3. check result @@ -224,10 +224,10 @@ TEST(DEV_API, multiply) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index 749004f3d71e9..f9dc1df818e45 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -56,10 +56,10 @@ TEST(DEV_API, flatten) { } int start_axis = 1, stop_axis = 2; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. 
test API auto out = pten::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc index 0dfbb54a496ae..b1c0b40eae775 100644 --- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -55,10 +55,10 @@ TEST(DEV_API, dot) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 5360c711d0a7c..4b84a13115047 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -49,10 +49,10 @@ TEST(DEV_API, mean) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index 95eda3ef2d49f..58004e718ea2a 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -48,10 +48,10 @@ TEST(DEV_API, reshape) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Reshape(dev_ctx, dense_x, shape); // 3. check result std::vector expect_shape = {12, 3}; diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index f27e1037d0471..ccad71711d610 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -49,10 +49,11 @@ TEST(DEV_API, scale) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc index e4841097520ba..967609e9a8c3c 100644 --- a/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/common/place.h" #include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/kernels/sparse/sparse_utils_kernel.h" @@ -22,6 +24,8 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" + namespace pten { namespace tests { @@ -38,9 +42,8 @@ inline void CheckResult( ASSERT_EQ(coo.nnz(), non_zero_num); #if defined(PADDLE_WITH_CUDA) - if (coo.place() == paddle::platform::CUDAPlace()) { - const auto* dev_ctx_cuda = - static_cast(dev_ctx); + if (coo.place() == pten::GPUPlace()) { + const auto* dev_ctx_cuda = static_cast(dev_ctx); DenseTensor indices( alloc.get(), DenseTensorMeta( @@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, paddle::platform::CPUPlace()); pten::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + // 1. test cpu auto cpu_sparse_out = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); @@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, // 2. test cuda #if defined(PADDLE_WITH_CUDA) - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + // paddle::platform::DeviceContextPool& pool = + // paddle::platform::DeviceContextPool::Instance(); + // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + pten::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(pten::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + const auto cuda_alloc = std::make_shared( paddle::platform::CUDAPlace()); @@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, cuda_alloc.get(), DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); - pten::Copy(*dev_ctx_cuda, dense_x, true, &d_dense_x); + pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); auto sparse_out = - sparse::DenseToSparseCoo(*dev_ctx_cuda, d_dense_x, sparse_dim); - CheckResult(dev_ctx_cuda, + sparse::DenseToSparseCoo(dev_ctx_gpu, d_dense_x, sparse_dim); + CheckResult(&dev_ctx_gpu, sparse_out, non_zero_data, indices_data, @@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims, alloc); // 2. 
test cuda #if defined(PADDLE_WITH_CUDA) + pten::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(pten::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + const auto cuda_alloc = std::make_shared( paddle::platform::CUDAPlace()); - auto& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + // auto& pool = paddle::platform::DeviceContextPool::Instance(); + // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta); pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta); pten::DenseTensor d_values(cuda_alloc.get(), values_meta); - pten::Copy(*dev_ctx_cuda, crows, true, &d_crows); - pten::Copy(*dev_ctx_cuda, cols, true, &d_cols); - pten::Copy(*dev_ctx_cuda, values, true, &d_values); + pten::Copy(dev_ctx_gpu, crows, true, &d_crows); + pten::Copy(dev_ctx_gpu, cols, true, &d_cols); + pten::Copy(dev_ctx_gpu, values, true, &d_values); pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); - auto cuda_sparse_out = sparse::SparseCsrToCoo(*dev_ctx_cuda, d_csr); - CheckResult(dev_ctx_cuda, + auto cuda_sparse_out = sparse::SparseCsrToCoo(dev_ctx_gpu, d_csr); + CheckResult(&dev_ctx_gpu, cuda_sparse_out, non_zero_data, indices_data, diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 023d62c7ed9ff..b05d40692afb4 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -47,10 +47,11 @@ TEST(DEV_API, sum) { std::vector axis = {0, 1}; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + // 2. test API auto out = pten::Sum(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
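Taken together, the dev-API test changes above all follow the same construction recipe for the reworked contexts: set the allocators explicitly, then run the (partial) Init calls. Condensed into two helpers as a sketch mirroring the test code, assuming a CUDA build; the helper names InitTestCPUContext and InitTestGPUContext are illustrative, not part of the patch:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/place.h"

// CPU: set the allocator, then Init() so the context sets up its remaining
// resources itself (training-style initialization).
void InitTestCPUContext(pten::CPUContext* ctx) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(paddle::platform::CPUPlace())
                        .get());
  ctx->Init();
}

// GPU: the stream has to exist before the stream-aware device allocator can be
// fetched, hence the two-phase PartialInit* calls around the Set*Allocator calls.
void InitTestGPUContext(pten::GPUContext* ctx) {
  ctx->PartialInitWithoutAllocator();
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(ctx->GetPlace(), ctx->stream())
                        .get());
  ctx->SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(pten::CPUPlace())
                            .get());
  ctx->PartialInitWithAllocator();
}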