Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inference multi stream support handle lazy init. #44563

Merged
merged 5 commits into from
Jul 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -415,14 +415,16 @@ void AnalysisPredictor::InitDeviceContexts() {
gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());

gpu_context->SetStream(gpu_resource->GetStream());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandle());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
gpu_context->SetBlasTensorCoreHandle(
gpu_resource->GetBlasTensorCoreHandle());
gpu_context->SetBlasTF32Handle(gpu_resource->GetBlasTF32Handle());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandle());
gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandle());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandle());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice());
gpu_resource->GetBlasTensorCoreHandleCreator());
gpu_context->SetBlasTF32Handle(
gpu_resource->GetBlasTF32TensorCoreHandleCreator());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
gpu_context->SetSolverHandle(
gpu_resource->GetSolverDnHandleCreator());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDeviceCreator());
gpu_context->SetComputeCapability(
gpu_resource->GetGpuComputeCapability());
gpu_context->SetMaxThreadsPerBlock(
Expand Down
158 changes: 113 additions & 45 deletions paddle/fluid/inference/api/resource_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "paddle/fluid/inference/api/resource_manager.h"

#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>
Expand Down Expand Up @@ -150,12 +151,6 @@ void GPUContextResource::InitGPUResource(void* stream) {
}

InitGpuProperties();
InitGpuEigenDevice();
InitDnnHanlde();
InitBlasHandle();
InitBlasLtHandle();
InitSolverHandle();
InitSparseHandle();
}

void GPUContextResource::DestroyGPUResource() {
Expand Down Expand Up @@ -203,22 +198,6 @@ void GPUContextResource::DestroyDnnHandle() {
phi::DestroyDnnHandle(dnn_handle_);
}

// Eagerly creates the cuBLAS/rocBLAS handles bound to this resource's stream.
// On CUDA builds it additionally creates the tensor-core handle (CUDA >= 9.0)
// and the TF32 tensor-core handle (CUDA >= 11.0), and switches each one into
// its corresponding math mode via cublasSetMathMode.
// NOTE(review): order matters -- each handle must exist before its math mode
// is set.
void GPUContextResource::InitBlasHandle() {
  phi::InitBlasHandle(&blas_handle_, stream_);
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
  phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
  // Allow tensor-core (FP16 mixed precision) kernels on this handle.
  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
      blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#endif
#if CUDA_VERSION >= 11000
  phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
  // Allow TF32 tensor-core kernels on this handle.
  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
      blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif
#endif
}

void GPUContextResource::DestroyBlasHandle() {
phi::DestroyBlasHandle(blas_handle_);
phi::DestroyBlasHandle(blas_tensor_core_handle_);
Expand Down Expand Up @@ -255,32 +234,106 @@ gpuStream_t GPUContextResource::GetStream() const { return stream_; }

dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; }

std::function<phi::dnnHandle_t()> GPUContextResource::GetDnnHandleCreator() {
return [&]() -> phi::dnnHandle_t {
InitDnnHanlde();
return dnn_handle_;
};
}

blasHandle_t GPUContextResource::GetBlasHandle() const { return blas_handle_; }

std::function<phi::blasHandle_t()> GPUContextResource::GetBlasHandleCreator() {
return [&]() -> phi::blasHandle_t {
phi::InitBlasHandle(&blas_handle_, stream_);
return blas_handle_;
};
}

// Returns the cached tensor-core cuBLAS handle.
// NOTE(review): likely null until GetBlasTensorCoreHandleCreator()'s functor
// has run -- confirm the member's initializer in resource_manager.h.
blasHandle_t GPUContextResource::GetBlasTensorCoreHandle() const {
  return blas_tensor_core_handle_;
}

// Returns a lazy-creation callback for the tensor-core cuBLAS handle: the
// handle is only created (and switched to CUBLAS_TENSOR_OP_MATH) when the
// returned functor is invoked, and only on CUDA >= 9.0 builds.
// NOTE(review): on HIP / older-CUDA builds the functor returns the member
// unmodified -- presumably null; confirm its initializer in the header.
std::function<phi::blasHandle_t()>
GPUContextResource::GetBlasTensorCoreHandleCreator() {
  return [&]() -> phi::blasHandle_t {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
    phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
    // Allow tensor-core kernels on this handle.
    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#endif
#endif
    return blas_tensor_core_handle_;
  };
}

// Returns the cached TF32 tensor-core cuBLAS handle.
// NOTE(review): likely null until GetBlasTF32TensorCoreHandleCreator()'s
// functor has run -- confirm the member's initializer in resource_manager.h.
blasHandle_t GPUContextResource::GetBlasTF32Handle() const {
  return blas_tf32_tensor_core_handle_;
}

// Returns a lazy-creation callback for the TF32 tensor-core cuBLAS handle:
// the handle is only created (and switched to CUBLAS_TF32_TENSOR_OP_MATH)
// when the returned functor is invoked, and only on CUDA >= 11.0 builds.
// NOTE(review): on HIP / older-CUDA builds the functor returns the member
// unmodified -- presumably null; confirm its initializer in the header.
std::function<phi::blasHandle_t()>
GPUContextResource::GetBlasTF32TensorCoreHandleCreator() {
  return [&]() -> phi::blasHandle_t {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 11000
    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
    // Allow TF32 tensor-core kernels on this handle.
    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif
#endif
    return blas_tf32_tensor_core_handle_;
  };
}

// Returns the cached cuBLASLt handle.
// NOTE(review): likely null until GetBlasLtHandleCreator()'s functor has
// run -- confirm the member's initializer in resource_manager.h.
blasLtHandle_t GPUContextResource::GetBlasLtHandle() const {
  return blaslt_handle_;
}

std::function<phi::blasLtHandle_t()>
GPUContextResource::GetBlasLtHandleCreator() {
return [&]() {
InitBlasLtHandle();
return blaslt_handle_;
};
}

// Returns the cached cuSOLVER handle.
// NOTE(review): likely null until GetSolverDnHandleCreator()'s functor has
// run -- confirm the member's initializer in resource_manager.h.
phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const {
  return solver_handle_;
}

std::function<phi::solverHandle_t()>
GPUContextResource::GetSolverDnHandleCreator() {
return [&]() {
InitSolverHandle();
return solver_handle_;
};
}

// Returns the cached cuSPARSE handle.
// NOTE(review): likely null until GetSparseHandleCreator()'s functor has
// run -- confirm the member's initializer in resource_manager.h.
phi::sparseHandle_t GPUContextResource::GetSparseHandle() const {
  return sparse_handle_;
}

std::function<phi::sparseHandle_t()>
GPUContextResource::GetSparseHandleCreator() {
return [&]() {
InitSparseHandle();
return sparse_handle_;
};
}

// Returns the cached Eigen GPU device (non-owning pointer).
// NOTE(review): likely null until GetGpuEigenDeviceCreator()'s functor has
// run -- confirm gpu_eigen_device_'s initialization in resource_manager.h.
Eigen::GpuDevice* GPUContextResource::GetGpuEigenDevice() const {
  return gpu_eigen_device_.get();
}

std::function<Eigen::GpuDevice*()>
GPUContextResource::GetGpuEigenDeviceCreator() {
return [&]() {
InitGpuEigenDevice();
return gpu_eigen_device_.get();
};
}

int GPUContextResource::GetGpuComputeCapability() const {
return compute_capability_;
}
Expand Down Expand Up @@ -311,67 +364,82 @@ void GPUContextResource::ReBindStream(gpuStream_t stream) {
}

// Re-binds the cuDNN/miopen handle to a new stream. No-op when the handle
// has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// SetStream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
  if (dnn_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
  }
}

// Re-binds the cuBLAS/rocBLAS handle to a new stream. No-op when the handle
// has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
  if (blas_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
  }
}

// Re-binds the tensor-core cuBLAS/rocBLAS handle to a new stream. No-op when
// the handle has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
  if (blas_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
  }
}

// Re-binds the TF32 tensor-core cuBLAS/rocBLAS handle to a new stream. No-op
// when the handle has not been lazily created yet.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
  if (blas_tf32_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_set_stream(
        blas_tf32_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
  }
}

// Re-binds the cuSOLVER handle to a new stream. No-op when the handle has not
// been lazily created yet, and a no-op on HIP builds (no rocSOLVER rebind).
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
  if (solver_handle_) {
#ifndef PADDLE_WITH_HIP
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
  }
}

// Re-binds the cuSPARSE handle to a new stream. No-op when the handle has not
// been lazily created yet, or on builds without CUDA >= 11.0.
// NOTE(review): the scraped diff interleaved the old and new versions of the
// set-stream call, duplicating it; deduplicated to a single call here.
void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
  if (sparse_handle_) {
#if defined(PADDLE_WITH_CUDA)
// The generic cuSPARSE APIs are supported from CUDA 10.1.
#if CUDA_VERSION >= 11000
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
  }
}

// Re-initializes the Eigen stream wrapper against a new raw stream and the
// allocator for this resource's place. No-op when the Eigen stream has not
// been lazily created yet.
// NOTE(review): the scraped diff interleaved the old (unguarded) and new
// (null-guarded) bodies; the guarded version is kept, matching the lazy-init
// pattern of the other ReBind* methods.
void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
                                           GPUPlace place) const {
  if (eigen_stream_) {
    auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(place_)
                          .get();
    eigen_stream_->Reinitialize(stream, allocator, place);
  }
}

#endif
Expand Down
10 changes: 9 additions & 1 deletion paddle/fluid/inference/api/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ class GPUContextResource {
~GPUContextResource();
phi::Place Place() const;

std::function<phi::dnnHandle_t()> GetDnnHandleCreator();
std::function<phi::blasHandle_t()> GetBlasHandleCreator();
std::function<phi::blasHandle_t()> GetBlasTensorCoreHandleCreator();
std::function<phi::blasHandle_t()> GetBlasTF32TensorCoreHandleCreator();
std::function<phi::blasLtHandle_t()> GetBlasLtHandleCreator();
std::function<phi::solverHandle_t()> GetSolverDnHandleCreator();
std::function<phi::sparseHandle_t()> GetSparseHandleCreator();
std::function<Eigen::GpuDevice*()> GetGpuEigenDeviceCreator();

gpuStream_t GetStream() const;
dnnHandle_t GetDnnHandle() const;
blasHandle_t GetBlasHandle() const;
Expand Down Expand Up @@ -89,7 +98,6 @@ class GPUContextResource {
void InitGpuEigenDevice();
void InitDnnHanlde();
void DestroyDnnHandle();
void InitBlasHandle();
void DestroyBlasHandle();
void InitBlasLtHandle();
void DestroyBlasLtHandle();
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/fused/resnet_basic_block_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace operators {
Expand Down
Loading