Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inference multi stream support handle lazy init. #44563

Merged
merged 5 commits into from
Jul 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -415,14 +415,16 @@ void AnalysisPredictor::InitDeviceContexts() {
gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());

gpu_context->SetStream(gpu_resource->GetStream());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandle());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
gpu_context->SetBlasTensorCoreHandle(
gpu_resource->GetBlasTensorCoreHandle());
gpu_context->SetBlasTF32Handle(gpu_resource->GetBlasTF32Handle());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandle());
gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandle());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandle());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice());
gpu_resource->GetBlasTensorCoreHandleCreator());
gpu_context->SetBlasTF32Handle(
gpu_resource->GetBlasTF32TensorCoreHandleCreator());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
gpu_context->SetSolverHandle(
gpu_resource->GetSolverDnHandleCreator());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDeviceCreator());
gpu_context->SetComputeCapability(
gpu_resource->GetGpuComputeCapability());
gpu_context->SetMaxThreadsPerBlock(
Expand Down
158 changes: 113 additions & 45 deletions paddle/fluid/inference/api/resource_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "paddle/fluid/inference/api/resource_manager.h"

#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>
Expand Down Expand Up @@ -150,12 +151,6 @@ void GPUContextResource::InitGPUResource(void* stream) {
}

InitGpuProperties();
InitGpuEigenDevice();
InitDnnHanlde();
InitBlasHandle();
InitBlasLtHandle();
InitSolverHandle();
InitSparseHandle();
}

void GPUContextResource::DestroyGPUResource() {
Expand Down Expand Up @@ -203,22 +198,6 @@ void GPUContextResource::DestroyDnnHandle() {
phi::DestroyDnnHandle(dnn_handle_);
}

// Eagerly creates the cuBLAS/rocBLAS handles bound to this resource's stream.
// On CUDA builds it additionally creates the tensor-core handle (CUDA >= 9.0)
// and the TF32 tensor-core handle (CUDA >= 11.0), and switches each one into
// its corresponding math mode via cublasSetMathMode.
// NOTE(review): order matters -- each handle must exist before its math mode
// is set.
void GPUContextResource::InitBlasHandle() {
  phi::InitBlasHandle(&blas_handle_, stream_);
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
  phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
  // Allow tensor-core (FP16 mixed precision) kernels on this handle.
  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
      blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#endif
#if CUDA_VERSION >= 11000
  phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
  // Allow TF32 tensor-core kernels on this handle.
  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
      blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif
#endif
}

void GPUContextResource::DestroyBlasHandle() {
phi::DestroyBlasHandle(blas_handle_);
phi::DestroyBlasHandle(blas_tensor_core_handle_);
Expand Down Expand Up @@ -255,32 +234,106 @@ gpuStream_t GPUContextResource::GetStream() const { return stream_; }

dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; }

std::function<phi::dnnHandle_t()> GPUContextResource::GetDnnHandleCreator() {
return [&]() -> phi::dnnHandle_t {
InitDnnHanlde();
return dnn_handle_;
};
}

blasHandle_t GPUContextResource::GetBlasHandle() const { return blas_handle_; }

std::function<phi::blasHandle_t()> GPUContextResource::GetBlasHandleCreator() {
return [&]() -> phi::blasHandle_t {
phi::InitBlasHandle(&blas_handle_, stream_);
return blas_handle_;
};
}

// Returns the cached tensor-core cuBLAS handle.
// NOTE(review): likely null until GetBlasTensorCoreHandleCreator()'s functor
// has run -- confirm the member's initializer in resource_manager.h.
blasHandle_t GPUContextResource::GetBlasTensorCoreHandle() const {
  return blas_tensor_core_handle_;
}

// Returns a lazy-creation callback for the tensor-core cuBLAS handle: the
// handle is only created (and switched to CUBLAS_TENSOR_OP_MATH) when the
// returned functor is invoked, and only on CUDA >= 9.0 builds.
// NOTE(review): on HIP / older-CUDA builds the functor returns the member
// unmodified -- presumably null; confirm its initializer in the header.
std::function<phi::blasHandle_t()>
GPUContextResource::GetBlasTensorCoreHandleCreator() {
  return [&]() -> phi::blasHandle_t {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
    phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
    // Allow tensor-core kernels on this handle.
    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#endif
#endif
    return blas_tensor_core_handle_;
  };
}

// Returns the cached TF32 tensor-core cuBLAS handle.
// NOTE(review): likely null until GetBlasTF32TensorCoreHandleCreator()'s
// functor has run -- confirm the member's initializer in resource_manager.h.
blasHandle_t GPUContextResource::GetBlasTF32Handle() const {
  return blas_tf32_tensor_core_handle_;
}

// Returns a lazy-creation callback for the TF32 tensor-core cuBLAS handle:
// the handle is only created (and switched to CUBLAS_TF32_TENSOR_OP_MATH)
// when the returned functor is invoked, and only on CUDA >= 11.0 builds.
// NOTE(review): on HIP / older-CUDA builds the functor returns the member
// unmodified -- presumably null; confirm its initializer in the header.
std::function<phi::blasHandle_t()>
GPUContextResource::GetBlasTF32TensorCoreHandleCreator() {
  return [&]() -> phi::blasHandle_t {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 11000
    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
    // Allow TF32 tensor-core kernels on this handle.
    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif
#endif
    return blas_tf32_tensor_core_handle_;
  };
}

// Returns the cached cuBLASLt handle.
// NOTE(review): likely null until GetBlasLtHandleCreator()'s functor has
// run -- confirm the member's initializer in resource_manager.h.
blasLtHandle_t GPUContextResource::GetBlasLtHandle() const {
  return blaslt_handle_;
}

std::function<phi::blasLtHandle_t()>
GPUContextResource::GetBlasLtHandleCreator() {
return [&]() {
InitBlasLtHandle();
return blaslt_handle_;
};
}

// Returns the cached cuSOLVER handle.
// NOTE(review): likely null until GetSolverDnHandleCreator()'s functor has
// run -- confirm the member's initializer in resource_manager.h.
phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const {
  return solver_handle_;
}

std::function<phi::solverHandle_t()>
GPUContextResource::GetSolverDnHandleCreator() {
return [&]() {
InitSolverHandle();
return solver_handle_;
};
}

// Returns the cached cuSPARSE handle.
// NOTE(review): likely null until GetSparseHandleCreator()'s functor has
// run -- confirm the member's initializer in resource_manager.h.
phi::sparseHandle_t GPUContextResource::GetSparseHandle() const {
  return sparse_handle_;
}

std::function<phi::sparseHandle_t()>
GPUContextResource::GetSparseHandleCreator() {
return [&]() {
InitSparseHandle();
return sparse_handle_;
};
}

// Returns the cached Eigen GPU device (non-owning pointer).
// NOTE(review): likely null until GetGpuEigenDeviceCreator()'s functor has
// run -- confirm gpu_eigen_device_'s initialization in resource_manager.h.
Eigen::GpuDevice* GPUContextResource::GetGpuEigenDevice() const {
  return gpu_eigen_device_.get();
}

std::function<Eigen::GpuDevice*()>
GPUContextResource::GetGpuEigenDeviceCreator() {
return [&]() {
InitGpuEigenDevice();
return gpu_eigen_device_.get();
};
}

int GPUContextResource::GetGpuComputeCapability() const {
return compute_capability_;
}
Expand Down Expand Up @@ -311,67 +364,82 @@ void GPUContextResource::ReBindStream(gpuStream_t stream) {
}

// Re-binds the cuDNN/miopen handle to a new stream. No-op when the handle
// has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// SetStream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
  if (dnn_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
  }
}

// Re-binds the cuBLAS/rocBLAS handle to a new stream. No-op when the handle
// has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
  if (blas_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
  }
}

// Re-binds the tensor-core cuBLAS/rocBLAS handle to a new stream. No-op when
// the handle has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
  if (blas_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
  }
}

// Re-binds the TF32 tensor-core cuBLAS/rocBLAS handle to a new stream. No-op
// when the handle has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
  if (blas_tf32_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_set_stream(
        blas_tf32_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
  }
}

// Re-binds the cuSOLVER handle to a new stream. No-op when the handle has not
// been lazily created yet, and a no-op on HIP builds (no rocSOLVER rebind).
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
  if (solver_handle_) {
#ifndef PADDLE_WITH_HIP
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
  }
}

// Re-binds the cuSPARSE handle to a new stream. No-op when the handle has not
// been lazily created yet, or on builds without CUDA >= 11.0.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
  if (sparse_handle_) {
#if defined(PADDLE_WITH_CUDA)
// The generic cuSPARSE APIs are supported from CUDA 10.1.
#if CUDA_VERSION >= 11000
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
  }
}

// Re-initializes the Eigen stream wrapper against a new raw stream and the
// allocator for this resource's place. No-op when the Eigen stream has not
// been lazily created yet.
// NOTE(review): the scraped diff interleaved the old (unguarded) and new
// (null-guarded) bodies; the guarded version is kept, matching the lazy-init
// pattern of the other ReBind* methods.
void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
                                           GPUPlace place) const {
  if (eigen_stream_) {
    auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(place_)
                          .get();
    eigen_stream_->Reinitialize(stream, allocator, place);
  }
}

#endif
Expand Down
10 changes: 9 additions & 1 deletion paddle/fluid/inference/api/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ class GPUContextResource {
~GPUContextResource();
phi::Place Place() const;

std::function<phi::dnnHandle_t()> GetDnnHandleCreator();
std::function<phi::blasHandle_t()> GetBlasHandleCreator();
std::function<phi::blasHandle_t()> GetBlasTensorCoreHandleCreator();
std::function<phi::blasHandle_t()> GetBlasTF32TensorCoreHandleCreator();
std::function<phi::blasLtHandle_t()> GetBlasLtHandleCreator();
std::function<phi::solverHandle_t()> GetSolverDnHandleCreator();
std::function<phi::sparseHandle_t()> GetSparseHandleCreator();
std::function<Eigen::GpuDevice*()> GetGpuEigenDeviceCreator();

gpuStream_t GetStream() const;
dnnHandle_t GetDnnHandle() const;
blasHandle_t GetBlasHandle() const;
Expand Down Expand Up @@ -89,7 +98,6 @@ class GPUContextResource {
void InitGpuEigenDevice();
void InitDnnHanlde();
void DestroyDnnHandle();
void InitBlasHandle();
void DestroyBlasHandle();
void InitBlasLtHandle();
void DestroyBlasLtHandle();
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/fused/resnet_basic_block_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace operators {
Expand Down
Loading