[PTEN] Add Gpu context (PaddlePaddle#39305)
jiweibo authored Feb 6, 2022
1 parent dcff7fa commit a821c4a
Showing 135 changed files with 4,438 additions and 770 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/common/utils.h
@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
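Throughout this commit, copy-initialization of device contexts ("auto cpu_ctx = paddle::platform::CPUDeviceContext();") gives way to a plain declaration. A minimal sketch of why, assuming the contexts become non-copyable once they own allocator pointers; the stand-in DeviceContext below is illustrative, not Paddle's real class:

#include <cassert>

class Allocator {};

// Stand-in for a device context that owns allocator wiring; copying is
// deleted so two contexts can never alias the same allocator state.
class DeviceContext {
 public:
  DeviceContext() = default;
  DeviceContext(const DeviceContext&) = delete;
  DeviceContext& operator=(const DeviceContext&) = delete;
  void SetAllocator(Allocator* a) { allocator_ = a; }
  Allocator* allocator() const { return allocator_; }

 private:
  Allocator* allocator_ = nullptr;
};

int main() {
  // Pre-C++17, copy-initialization needs an accessible copy/move ctor even
  // though the copy is elided, so this form stops compiling:
  //   auto ctx = DeviceContext();
  DeviceContext ctx;  // the declaration style this commit switches to
  Allocator alloc;
  ctx.SetAllocator(&alloc);
  assert(ctx.allocator() == &alloc);
  return 0;
}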
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();

- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());

// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();

auto dims1 = t_latest.dims()[1];
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;

auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);

- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}

// set output tensor to 0.
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
- auto dev_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
5 changes: 4 additions & 1 deletion paddle/fluid/framework/data_type_transform_test.cu
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);

+ context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(gpu_place, context.stream())
+ .get());
+ context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
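The hunk above shows the pattern this PR threads through every CUDA test and helper: a CUDADeviceContext is no longer fully usable straight after construction; the caller first attaches a stream-specific allocator, then calls PartialInitWithAllocator(). A mock of that two-phase contract, with internals that are guesses rather than Paddle's actual implementation:

#include <cassert>

struct Allocator {};

class MockCudaContext {
 public:
  // Phase 1: the constructor sets up only what needs no device memory.
  MockCudaContext() = default;

  void SetAllocator(Allocator* a) { allocator_ = a; }

  // Phase 2: finish the initialization that requires allocations, e.g.
  // library handles and workspace buffers (a guess at what the real
  // PartialInitWithAllocator does).
  void PartialInitWithAllocator() {
    assert(allocator_ != nullptr && "call SetAllocator() first");
    fully_initialized_ = true;
  }

  bool ready() const { return fully_initialized_; }

 private:
  Allocator* allocator_ = nullptr;
  bool fully_initialized_ = false;
};

int main() {
  Allocator stream_allocator;  // the real code fetches this per (place, stream)
  MockCudaContext ctx;
  ctx.SetAllocator(&stream_allocator);
  ctx.PartialInitWithAllocator();
  assert(ctx.ready());
  return 0;
}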
2 changes: 1 addition & 1 deletion paddle/fluid/framework/parallel_executor.cc
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
- dev_ctx->set_bkcl_context(bkcl_ctx.comm());
+ dev_ctx->SetBkclContext(bkcl_ctx.comm());
}
#else
PADDLE_THROW(
7 changes: 7 additions & 0 deletions paddle/fluid/framework/pten_utils.h
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};

+ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+ template <>
+ struct ConvertToPtenContext<platform::CUDADeviceContext> {
+ using TYPE = pten::GPUContext;
+ };
+ #endif

#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
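The new specialization maps platform::CUDADeviceContext to pten::GPUContext at compile time. A self-contained sketch of how such a trait is typically consumed; the mock types below are stand-ins, and the cast rests on the assumption, suggested by this PR's design, that the fluid context derives from its pten counterpart:

#include <iostream>

namespace pten {
struct CPUContext { const char* name() const { return "pten::CPUContext"; } };
struct GPUContext { const char* name() const { return "pten::GPUContext"; } };
}  // namespace pten

namespace platform {
struct CPUDeviceContext : pten::CPUContext {};
struct CUDADeviceContext : pten::GPUContext {};

template <typename T>
struct ConvertToPtenContext { using TYPE = T; };
template <>
struct ConvertToPtenContext<CPUDeviceContext> { using TYPE = pten::CPUContext; };
template <>
struct ConvertToPtenContext<CUDADeviceContext> { using TYPE = pten::GPUContext; };
}  // namespace platform

template <typename DeviceContext>
void RunWithPtenContext(const DeviceContext& dev_ctx) {
  // Map the fluid context type to its pten counterpart at compile time,
  // then upcast; safe because the fluid type inherits from the pten type.
  using PtenCtx = typename platform::ConvertToPtenContext<DeviceContext>::TYPE;
  const auto& pten_ctx = static_cast<const PtenCtx&>(dev_ctx);
  std::cout << pten_ctx.name() << "\n";
}

int main() {
  platform::CUDADeviceContext gpu_ctx;
  RunWithPtenContext(gpu_ctx);  // prints "pten::GPUContext"
  return 0;
}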
6 changes: 3 additions & 3 deletions paddle/fluid/framework/tensor_util.cc
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.seekg(seekg, is.cur);

void* buf;
- auto ctx = platform::CPUDeviceContext();
+ platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
- auto ctx = platform::CPUDeviceContext();
+ platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
- }
+ } // namespace pten
20 changes: 20 additions & 0 deletions paddle/fluid/framework/tensor_util_test.cc
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(*gpu_place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);

// GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(*gpu_place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);

std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);

std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {

auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(*gpu_place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();

TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);

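The same SetAllocator/PartialInitWithAllocator wiring recurs verbatim in each test above. A hypothetical helper, not part of this PR, could fold the boilerplate into one call; it uses only the AllocatorFacade and CUDADeviceContext calls already visible in this diff:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

namespace {

// Wire a stream-aware device allocator into the context, then finish its
// two-phase initialization. Mirrors the pattern each test repeats above.
void InitCudaContextForTest(const paddle::platform::CUDAPlace& place,
                            paddle::platform::CUDADeviceContext* ctx) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->PartialInitWithAllocator();
}

}  // namespace

With that helper, each test body would reduce to two lines: construct the context, then call InitCudaContextForTest(*gpu_place, &gpu_ctx).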
11 changes: 11 additions & 0 deletions paddle/fluid/imperative/gloo_context.cc
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
+ device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(platform::CPUPlace())
+ .get());
+ device_->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ device_->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(platform::CPUPlace())
+ .get());
}

void GLOOParallelContext::InitWithRingID(int ring_id) {
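Note that the context gets three distinct allocators here. Judging only by the setter names and call sites in this diff, they appear to cover regular tensor allocations, host-side staging allocations, and zero-size requests respectively. The sketch below is a guess at why a dedicated zero-size allocator is worth having, not Paddle's implementation:

#include <cassert>
#include <cstddef>

// A zero-size request should not hit the real allocator pool; it just needs
// a well-formed, empty allocation object the caller can hold and release.
struct Allocation {
  void* ptr = nullptr;
  size_t size = 0;
};

struct ZeroAllocator {
  Allocation Allocate(size_t size) {
    assert(size == 0 && "this allocator only serves empty tensors");
    return Allocation{};  // no pool bookkeeping, no device round trip
  }
};

int main() {
  ZeroAllocator zero;
  Allocation a = zero.Allocate(0);
  assert(a.ptr == nullptr && a.size == 0);
  return 0;
}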
4 changes: 4 additions & 0 deletions paddle/fluid/inference/lite/test_engine_lite.cc
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
+ ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, ctx.stream())
+ .get());
+ ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
12 changes: 12 additions & 0 deletions paddle/fluid/inference/tensorrt/test_engine.cc
@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+ ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
+ .get());
+ ctx_->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ ctx_->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(platform::CUDAPlace(0))
+ .get());
+ ctx_->PartialInitWithAllocator();

engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
5 changes: 5 additions & 0 deletions paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -18,6 +18,7 @@
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {

platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
+ dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(gpu, dev_ctx.stream())
+ .get());
+ dev_ctx.PartialInitWithAllocator();

auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
35 changes: 31 additions & 4 deletions paddle/fluid/memory/malloc_test.cu
@@ -25,6 +25,7 @@
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"

@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();

for (int i = 0; i < NUM_STREAMS; ++i) {
- dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
- new platform::CUDADeviceContext(place)));
+ auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+ new platform::CUDADeviceContext(place));
+ ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, ctx->stream())
+ .get());
+ ctx->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ ctx->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(place)
+ .get());
+ ctx->PartialInitWithAllocator();
+ dev_ctx.emplace_back(std::move(ctx));
MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
}

@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();

for (int i = 0; i < NUM_STREAMS; ++i) {
- dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
- new platform::CUDADeviceContext(place)));
+ auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+ new platform::CUDADeviceContext(place));
+ ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, ctx->stream())
+ .get());
+ ctx->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ ctx->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(place)
+ .get());
+ ctx->PartialInitWithAllocator();
+ dev_ctx.emplace_back(std::move(ctx));
threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
std::cref(*dev_ctx[i])));
}
2 changes: 1 addition & 1 deletion paddle/fluid/operators/arg_min_max_op_base.cu.h
@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
return block_size;
};

- int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+ int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
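These call sites track an API change: GetCUDAMaxGridDimSize() previously exposed CUDA's dim3 (fields .x/.y/.z) and now returns an indexable array type, hence the switch to [0]. A stand-in sketch of the before/after shape; std::array is an assumption, since the diff only shows that operator[] works:

#include <array>
#include <cstdint>

struct Dim3Like { int64_t x = 0, y = 0, z = 0; };  // old-style result
using GridDims = std::array<int64_t, 3>;           // assumed new-style result

// Old call sites read the x field; new ones index element 0.
int64_t MaxGridDimX(const Dim3Like& d) { return d.x; }
int64_t MaxGridDimX(const GridDims& d) { return d[0]; }

int main() {
  GridDims dims{2147483647, 65535, 65535};  // typical CUDA grid limits
  return MaxGridDimX(dims) > 0 ? 0 : 1;
}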
4 changes: 2 additions & 2 deletions paddle/fluid/operators/argsort_op.cu
@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,

int block_size = ComputeBlockSize(num_cols);

- int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+ int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
// Init a index array
@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,

int block_size = ComputeBlockSize(num_cols);

- int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+ int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
4 changes: 2 additions & 2 deletions paddle/fluid/operators/broadcast_tensors_op.cu
@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
- *input_tensor, output_tensor, kps::IdentityFunctor<T>(),
- reduce_dims_vec, stream);
+ context.cuda_device_context(), *input_tensor, output_tensor,
+ kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
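Every TensorReduceFunctorImpl call site in this commit gains the device context as a new leading argument; the raw stream alone evidently no longer suffices, presumably because the reduction needs the context's allocator for temporary storage (an inference, not something the diff states). A mock of the migrated call shape with stand-in types:

#include <vector>

struct Tensor {};
using gpuStream_t = void*;
struct GPUContext {
  gpuStream_t stream() const { return nullptr; }
};

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduceFunctorImpl(const GPUContext& dev_ctx,  // new first param
                             const Tensor& x, Tensor* y,
                             const TransformOp& transform,
                             const std::vector<int>& reduce_dims,
                             gpuStream_t stream) {
  // A real implementation would size temporary storage via dev_ctx's
  // allocator and launch the reduction kernel on the given stream.
  (void)dev_ctx; (void)x; (void)y; (void)transform;
  (void)reduce_dims; (void)stream;
}

struct AddFunctor {};
template <typename T> struct IdentityFunctor {};

int main() {
  GPUContext ctx;
  Tensor in, out;
  TensorReduceFunctorImpl<float, float, AddFunctor, IdentityFunctor<float>>(
      ctx, in, &out, IdentityFunctor<float>{}, {0}, ctx.stream());
  return 0;
}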
3 changes: 2 additions & 1 deletion paddle/fluid/operators/cholesky_solve_op.cu
@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
- in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
+ ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
+ out_reduce_dims, stream);
}
};

2 changes: 1 addition & 1 deletion paddle/fluid/operators/clip_by_norm_op.cu
@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
{1}, dev_ctx);
TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
*input, &tmp, kps::SquareFunctor<platform::float16, float>(),
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
auto tmp_eigen = EigenVector<float>::Flatten(tmp);
auto x_norm = tmp_eigen.sqrt();
3 changes: 2 additions & 1 deletion paddle/fluid/operators/controlflow/compare_all_op.cu
@@ -65,7 +65,8 @@ class CompareReduceOpKernel
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
kps::IdentityFunctor<bool>>(
- tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
+ context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
+ reduce_dims, stream);
}
}
};
8 changes: 8 additions & 0 deletions paddle/fluid/operators/copy_cross_scope_test.cc
@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
+ ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(p::CUDAPlace(0), ctx.stream())
+ .get());
+ ctx.PartialInitWithAllocator();
Compare1<float>(&scope, ctx, "copy_cross_scope");
}

TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
+ ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(p::CUDAPlace(0), ctx.stream())
+ .get());
+ ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL