Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update gpu context #39442

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions paddle/fluid/operators/conv_cudnn_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetForward());
Expand Down Expand Up @@ -311,7 +311,8 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func,
workspace_size_limit);

VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
Expand Down Expand Up @@ -410,7 +411,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardData());
Expand Down Expand Up @@ -440,7 +441,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func,
workspace_size_limit);

VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
Expand Down Expand Up @@ -529,7 +531,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardFilter());

Expand Down Expand Up @@ -557,8 +559,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func,
workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func,
workspace_size_limit);

VLOG(3)
<< "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
Expand Down
12 changes: 6 additions & 6 deletions paddle/fluid/operators/conv_miopen_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
const phi::GPUContext& ctx) {
algo_t algo;

auto workspace_handle = ctx.cudnn_workspace_handle();
auto* workspace_handle = ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
Expand All @@ -145,7 +145,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.fwd_algo;
VLOG(3) << "choose algo " << algo;
return algo;
Expand All @@ -172,7 +172,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
const phi::GPUContext& ctx) {
algo_t algo;

auto workspace_handle = ctx.cudnn_workspace_handle();
auto* workspace_handle = ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
Expand All @@ -186,7 +186,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.bwd_data_algo;
VLOG(3) << "choose algo " << algo;
return algo;
Expand All @@ -213,7 +213,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
const phi::GPUContext& ctx) {
algo_t algo;

auto workspace_handle = ctx.cudnn_workspace_handle();
auto* workspace_handle = ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
Expand All @@ -227,7 +227,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.bwd_weights_algo;
VLOG(3) << "choose algo " << algo;
return algo;
Expand Down
14 changes: 7 additions & 7 deletions paddle/fluid/operators/fused/conv_fusion_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {

miopenConvFwdAlgorithm_t algo;
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

auto x_dims = phi::vectorize(transformed_input.dims());
auto f_dims = phi::vectorize(filter->dims());
Expand All @@ -202,7 +202,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
kNUM_CUDNN_FWD_ALGS, &find_count, &find_result,
cudnn_workspace_ptr, workspace_size, false));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.fwd_algo;
VLOG(3) << "cuDNN forward algo " << algo;

Expand All @@ -214,7 +214,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc,
output_data, cudnn_workspace, workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
workspace_handle->RunFunc(cudnn_func, workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForwardBias(
handle, &alpha, cudnn_bias_desc, bias_data, &beta,
Expand Down Expand Up @@ -264,7 +264,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo;
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();

PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
Expand Down Expand Up @@ -318,7 +318,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
workspace_handle->RunFuncSync(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
Expand Down Expand Up @@ -372,7 +372,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
workspace_handle->RunFunc(cudnn_func, workspace_size_in_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data));
Expand All @@ -392,7 +392,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
cudnn_act_desc, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
workspace_handle->RunFunc(cudnn_func, workspace_size_in_bytes);
}
#endif
std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ class CudnnNormConvolution {
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);

ctx.cudnn_workspace_handle().RunFunc(
ctx.cudnn_workspace_handle()->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
Expand Down Expand Up @@ -298,7 +298,7 @@ class CudnnNormConvolutionGrad {
wgrad_op->SetOpVariantParamAttrPtr(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);

ctx.cudnn_workspace_handle().RunFunc(
ctx.cudnn_workspace_handle()->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
Expand All @@ -317,7 +317,7 @@ class CudnnNormConvolutionGrad {
// Convolution dgrad followed optionally by batchnorm dgrad
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
ctx.cudnn_workspace_handle().RunFunc(
ctx.cudnn_workspace_handle()->RunFunc(
[&](void *cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class CudnnScaleBiasAddRelu {
ForwardInit(ctx);
auto handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
auto workspace_handle = ctx.cudnn_workspace_handle();
auto *workspace_handle = ctx.cudnn_workspace_handle();
fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
// Set variant_param
// input ptr
Expand Down Expand Up @@ -146,7 +146,7 @@ class CudnnScaleBiasAddRelu {
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr);

workspace_handle.RunFunc(
workspace_handle->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
Expand All @@ -164,7 +164,7 @@ class CudnnScaleBiasAddRelu {
BackwardInit(ctx);
auto handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
auto workspace_handle = ctx.cudnn_workspace_handle();
auto *workspace_handle = ctx.cudnn_workspace_handle();
bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle);
// Set variant_param
// input ptr
Expand Down Expand Up @@ -203,7 +203,7 @@ class CudnnScaleBiasAddRelu {
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr);
}

workspace_handle.RunFunc(
workspace_handle->RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/fused/fusion_conv_inception_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,8 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
};
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes);
auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle->RunFunc(func, workspace_size_in_bytes);
}

cudnnTensorDescriptor_t x_desc;
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/platform/cuda_graph_with_memory_pool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace platform {
void BeginCUDAGraphCapture(platform::CUDAPlace place,
cudaStreamCaptureMode mode) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->cudnn_workspace_handle()->ResetWorkspace();

auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode);
Expand All @@ -39,7 +39,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
auto place = CUDAGraph::CapturingPlace();
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->cudnn_workspace_handle()->ResetWorkspace();
return CUDAGraph::EndCapture();
}
#endif
Expand Down
8 changes: 2 additions & 6 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -617,13 +617,9 @@ void CUDADeviceContext::WaitStreamCallback() const {
phi::GPUContext::WaitStreamCallback();
}

phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
phi::DnnWorkspaceHandle* CUDADeviceContext::cudnn_workspace_handle() const {
if (thread_ctx_.count(this)) {
// return workspace_.get();
return phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(GetPlace())
.get());
return workspace_.get();
}
return phi::GPUContext::cudnn_workspace_handle();
}
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/platform/device_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ class CUDADeviceContext : public phi::GPUContext {
* workspace. Once the handle is destructed, the lock would be released.
* CudnnWorkspaceHandle is an RAII object to implement thread-safe
* sequential cudnn function calls. */
phi::DnnWorkspaceHandle cudnn_workspace_handle() const;
phi::DnnWorkspaceHandle* cudnn_workspace_handle() const;

/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3154,7 +3154,7 @@ All parameter, weight, gradient are variables in Paddle.
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
platform::CUDAPlace(dev_id));
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->cudnn_workspace_handle()->ResetWorkspace();
}
platform::EmptyCache();
});
Expand Down
51 changes: 40 additions & 11 deletions paddle/phi/backends/gpu/gpu_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,15 +155,50 @@ static void StreamCallbackFunc(gpuStream_t stream,

} // namespace internal

void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; }
// Drop the cached cuDNN workspace buffer so its device memory is returned
// to the allocator. Thread-safe: serialized on the handle's mutex.
void DnnWorkspaceHandle::ResetWorkspace() {
  std::lock_guard<std::mutex> lock(*mtx_);
  allocation_.reset();
}

// Grow the workspace to at least `required_workspace_bytes`.
// The size check and the (re)allocation are performed under one mutex
// acquisition: the previous version called WorkspaceSize() (which locks and
// unlocks) and only then re-locked to reallocate, leaving a window where
// another thread could reset or replace `allocation_` between the check and
// the realloc (check-then-act race).
void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  std::lock_guard<std::mutex> guard(*mtx_);
  if (allocation_ != nullptr &&
      required_workspace_bytes <= allocation_->size()) {
    return;  // current buffer is already large enough
  }
  // Release the old buffer before allocating the larger one so peak device
  // memory never holds both buffers at once.
  allocation_.reset();
  allocation_ = allocator_->Allocate(required_workspace_bytes);
}

inline void DnnWorkspaceHandle::RunFunc(
const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
if (required_workspace_bytes > WorkspaceSize()) {
ReallocWorkspace(required_workspace_bytes);
}
{
std::lock_guard<std::mutex> guard(*mtx_);
cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
}
}

/*! \brief Thread which call RunFuncSync() would release gpu memory after
 * running the function. Currently this function is only used when cudnn
 * exhaustive searching and callers have to guarantee that the input function
 * is host blocking */
// NOTE: `inline` removed — an out-of-line member function defined `inline`
// in a .cc file is not emitted for other translation units, so callers that
// only see the header declaration fail to link.
void DnnWorkspaceHandle::RunFuncSync(
    const std::function<void(void*)>& cudnn_func,
    size_t required_workspace_bytes) {
  RunFunc(cudnn_func, required_workspace_bytes);
  // Caller guarantees cudnn_func blocked until completion, so the workspace
  // can be released immediately.
  ResetWorkspace();
}

// Return the size in bytes of the currently held workspace buffer
// (0 when none is allocated). Thread-safe via the handle's mutex.
// NOTE: `inline` removed — an out-of-line member function defined `inline`
// in a .cc file is not emitted for other translation units, so callers that
// only see the header declaration fail to link.
size_t DnnWorkspaceHandle::WorkspaceSize() {
  std::lock_guard<std::mutex> guard(*mtx_);
  return allocation_ == nullptr ? 0 : allocation_->size();
}

struct GPUContext::Impl {
void Init() {
owned_ = true;
Expand Down Expand Up @@ -305,15 +340,9 @@ struct GPUContext::Impl {
}
}

// TODO(wilber): The return type is a pointer, to be modified later.
// DnnWorkspaceHandle* GetDnnWorkspace() {
// PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
// return workspace_;
// }
DnnWorkspaceHandle GetDnnWorkspace() {
PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr.");
return DnnWorkspaceHandle(allocator_);
DnnWorkspaceHandle* GetDnnWorkspace() {
PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
return workspace_;
}

void InitStream() {
Expand Down Expand Up @@ -802,7 +831,7 @@ Eigen::GpuDevice* GPUContext::eigen_device() const {
return impl_->eigen_device();
}

DnnWorkspaceHandle GPUContext::cudnn_workspace_handle() const {
DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() const {
return impl_->GetDnnWorkspace();
}

Expand Down
Loading