Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update gpu context #39442

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions paddle/fluid/operators/conv_cudnn_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetForward());
Expand Down Expand Up @@ -311,7 +311,8 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func,
workspace_size_limit);

VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
Expand Down Expand Up @@ -410,7 +411,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardData());
Expand Down Expand Up @@ -440,7 +441,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func,
workspace_size_limit);

VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
Expand Down Expand Up @@ -529,7 +531,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardFilter());

Expand Down Expand Up @@ -557,8 +559,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func,
workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func,
workspace_size_limit);

VLOG(3)
<< "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
Expand Down
12 changes: 6 additions & 6 deletions paddle/fluid/operators/conv_miopen_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
const phi::GPUContext& ctx) {
algo_t algo;

auto workspace_handle = ctx.cudnn_workspace_handle();
auto* workspace_handle = ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
Expand All @@ -145,7 +145,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.fwd_algo;
VLOG(3) << "choose algo " << algo;
return algo;
Expand All @@ -172,7 +172,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
const phi::GPUContext& ctx) {
algo_t algo;

auto workspace_handle = ctx.cudnn_workspace_handle();
auto* workspace_handle = ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
Expand All @@ -186,7 +186,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.bwd_data_algo;
VLOG(3) << "choose algo " << algo;
return algo;
Expand All @@ -213,7 +213,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
const phi::GPUContext& ctx) {
algo_t algo;

auto workspace_handle = ctx.cudnn_workspace_handle();
auto* workspace_handle = ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
Expand All @@ -227,7 +227,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.bwd_weights_algo;
VLOG(3) << "choose algo " << algo;
return algo;
Expand Down
14 changes: 7 additions & 7 deletions paddle/fluid/operators/fused/conv_fusion_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {

miopenConvFwdAlgorithm_t algo;
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

auto x_dims = phi::vectorize(transformed_input.dims());
auto f_dims = phi::vectorize(filter->dims());
Expand All @@ -202,7 +202,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
kNUM_CUDNN_FWD_ALGS, &find_count, &find_result,
cudnn_workspace_ptr, workspace_size, false));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.fwd_algo;
VLOG(3) << "cuDNN forward algo " << algo;

Expand All @@ -214,7 +214,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc,
output_data, cudnn_workspace, workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
workspace_handle->RunFunc(cudnn_func, workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForwardBias(
handle, &alpha, cudnn_bias_desc, bias_data, &beta,
Expand Down Expand Up @@ -264,7 +264,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo;
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
Expand Down Expand Up @@ -318,7 +318,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
Expand Down Expand Up @@ -372,7 +372,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
workspace_handle->RunFunc(cudnn_func, workspace_size_in_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data));
Expand All @@ -392,7 +392,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
cudnn_act_desc, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
workspace_handle->RunFunc(cudnn_func, workspace_size_in_bytes);
}
#endif
std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ class CudnnNormConvolution {
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);

ctx.cudnn_workspace_handle().RunFunc(
ctx.cudnn_workspace_handle()->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
Expand Down Expand Up @@ -298,7 +298,7 @@ class CudnnNormConvolutionGrad {
wgrad_op->SetOpVariantParamAttrPtr(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);

ctx.cudnn_workspace_handle().RunFunc(
ctx.cudnn_workspace_handle()->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
Expand All @@ -317,7 +317,7 @@ class CudnnNormConvolutionGrad {
// Convolution dgrad followed optionally by batchnorm dgrad
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
ctx.cudnn_workspace_handle().RunFunc(
ctx.cudnn_workspace_handle()->RunFunc(
[&](void *cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class CudnnScaleBiasAddRelu {
ForwardInit(ctx);
auto handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
auto workspace_handle = ctx.cudnn_workspace_handle();
auto *workspace_handle = ctx.cudnn_workspace_handle();
fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
// Set variant_param
// input ptr
Expand Down Expand Up @@ -146,7 +146,7 @@ class CudnnScaleBiasAddRelu {
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr);

workspace_handle.RunFunc(
workspace_handle->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
Expand All @@ -164,7 +164,7 @@ class CudnnScaleBiasAddRelu {
BackwardInit(ctx);
auto handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
auto workspace_handle = ctx.cudnn_workspace_handle();
auto *workspace_handle = ctx.cudnn_workspace_handle();
bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle);
// Set variant_param
// input ptr
Expand Down Expand Up @@ -203,7 +203,7 @@ class CudnnScaleBiasAddRelu {
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr);
}

workspace_handle.RunFunc(
workspace_handle->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/fused/fusion_conv_inception_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,8 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
};
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes);
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle->RunFunc(func, workspace_size_in_bytes);
}

cudnnTensorDescriptor_t x_desc;
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/platform/cuda_graph_with_memory_pool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace platform {
void BeginCUDAGraphCapture(platform::CUDAPlace place,
cudaStreamCaptureMode mode) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->cudnn_workspace_handle()->ResetWorkspace();

auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode);
Expand All @@ -39,7 +39,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
auto place = CUDAGraph::CapturingPlace();
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->cudnn_workspace_handle()->ResetWorkspace();
return CUDAGraph::EndCapture();
}
#endif
Expand Down
8 changes: 2 additions & 6 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -617,13 +617,9 @@ void CUDADeviceContext::WaitStreamCallback() const {
phi::GPUContext::WaitStreamCallback();
}

phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
phi::DnnWorkspaceHandle* CUDADeviceContext::cudnn_workspace_handle() const {
if (thread_ctx_.count(this)) {
// return workspace_.get();
return phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(GetPlace())
.get());
return workspace_.get();
}
return phi::GPUContext::cudnn_workspace_handle();
}
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/platform/device_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ class CUDADeviceContext : public phi::GPUContext {
* workspace. Once the handle is destructed, the lock would be released.
* CudnnWorkspaceHandle is an RAII object to implement thread-safe
* sequential cudnn function calls. */
phi::DnnWorkspaceHandle cudnn_workspace_handle() const;
phi::DnnWorkspaceHandle* cudnn_workspace_handle() const;

/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3154,7 +3154,7 @@ All parameter, weight, gradient are variables in Paddle.
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
platform::CUDAPlace(dev_id));
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->cudnn_workspace_handle()->ResetWorkspace();
}
platform::EmptyCache();
});
Expand Down
51 changes: 40 additions & 11 deletions paddle/phi/backends/gpu/gpu_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,15 +155,50 @@ static void StreamCallbackFunc(gpuStream_t stream,

} // namespace internal

void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; }
// Drop the cached cuDNN workspace buffer so its device memory is returned
// to the allocator. Thread-safe: serialized on the handle's mutex.
void DnnWorkspaceHandle::ResetWorkspace() {
  std::lock_guard<std::mutex> lock(*mtx_);
  allocation_.reset();
}

// Grow the workspace to at least `required_workspace_bytes`.
// The size check and the (re)allocation are performed under one mutex
// acquisition: the previous version called WorkspaceSize() (which locks and
// unlocks) and only then re-locked to reallocate, leaving a window where
// another thread could reset or replace `allocation_` between the check and
// the realloc (check-then-act race).
void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  std::lock_guard<std::mutex> guard(*mtx_);
  if (allocation_ != nullptr &&
      required_workspace_bytes <= allocation_->size()) {
    return;  // current buffer is already large enough
  }
  // Release the old buffer before allocating the larger one so peak device
  // memory never holds both buffers at once.
  allocation_.reset();
  allocation_ = allocator_->Allocate(required_workspace_bytes);
}

inline void DnnWorkspaceHandle::RunFunc(
const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
if (required_workspace_bytes > WorkspaceSize()) {
ReallocWorkspace(required_workspace_bytes);
}
{
std::lock_guard<std::mutex> guard(*mtx_);
cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
}
}

/*! \brief Thread which call RunFuncSync() would release gpu memory after
 * running the function. Currently this function is only used when cudnn
 * exhaustive searching and callers have to guarantee that the input function
 * is host blocking */
// NOTE: `inline` removed — an out-of-line member function defined `inline`
// in a .cc file is not emitted for other translation units, so callers that
// only see the header declaration fail to link.
void DnnWorkspaceHandle::RunFuncSync(
    const std::function<void(void*)>& cudnn_func,
    size_t required_workspace_bytes) {
  RunFunc(cudnn_func, required_workspace_bytes);
  // Caller guarantees cudnn_func blocked until completion, so the workspace
  // can be released immediately.
  ResetWorkspace();
}

// Return the size in bytes of the currently held workspace buffer
// (0 when none is allocated). Thread-safe via the handle's mutex.
// NOTE: `inline` removed — an out-of-line member function defined `inline`
// in a .cc file is not emitted for other translation units, so callers that
// only see the header declaration fail to link.
size_t DnnWorkspaceHandle::WorkspaceSize() {
  std::lock_guard<std::mutex> guard(*mtx_);
  return allocation_ == nullptr ? 0 : allocation_->size();
}

struct GPUContext::Impl {
void Init() {
owned_ = true;
Expand Down Expand Up @@ -305,15 +340,9 @@ struct GPUContext::Impl {
}
}

// TODO(wilber): The return type is a pointer, to be modified later.
// DnnWorkspaceHandle* GetDnnWorkspace() {
// PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
// return workspace_;
// }
DnnWorkspaceHandle GetDnnWorkspace() {
PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr.");
return DnnWorkspaceHandle(allocator_);
DnnWorkspaceHandle* GetDnnWorkspace() {
PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
return workspace_;
}

void InitStream() {
Expand Down Expand Up @@ -802,7 +831,7 @@ Eigen::GpuDevice* GPUContext::eigen_device() const {
return impl_->eigen_device();
}

DnnWorkspaceHandle GPUContext::cudnn_workspace_handle() const {
DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() const {
return impl_->GetDnnWorkspace();
}

Expand Down
Loading