Commit

update
jiweibo committed Feb 7, 2022
1 parent 298e3cb commit 3786705
Showing 14 changed files with 90 additions and 81 deletions.
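In the three files shown below, the change is mechanical: dev_ctx.cudnn_workspace_handle() is now used as a value rather than through a pointer, so `auto*` declarations become `auto` and `->` member calls become `.` calls. The following is a minimal, hypothetical sketch of that pattern; WorkspaceHandle and DeviceContext are simplified stand-ins, not the real Paddle types, and only the call shape mirrors the diff.

```cpp
// Hedged sketch of the call-pattern change in this commit: the workspace
// handle is held by value, so `auto*` + `->` becomes `auto` + `.`.
// WorkspaceHandle and DeviceContext are illustrative stand-ins only.
#include <cstddef>
#include <functional>
#include <vector>

struct WorkspaceHandle {
  // Allocate required_size bytes of scratch space and hand it to the callback.
  void RunFunc(const std::function<void(void*)>& func, size_t required_size) {
    std::vector<char> workspace(required_size);
    func(workspace.data());
  }
  // Used for algorithm search in the diff; assume it also synchronizes the
  // device stream after the callback returns.
  void RunFuncSync(const std::function<void(void*)>& func, size_t required_size) {
    RunFunc(func, required_size);
  }
};

struct DeviceContext {
  WorkspaceHandle cudnn_workspace_handle() { return WorkspaceHandle{}; }  // by value
};

void Example(DeviceContext& dev_ctx, size_t workspace_size) {
  // Before: auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
  //         workspace_handle->RunFunc(..., workspace_size);
  auto workspace_handle = dev_ctx.cudnn_workspace_handle();
  workspace_handle.RunFunc(
      [&](void* workspace_ptr) {
        (void)workspace_ptr;  // the cuDNN/MIOpen call needing scratch memory goes here
      },
      workspace_size);
}
```

Apart from the `.` versus `->` spelling, the call sites in the hunks below are otherwise unchanged.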
19 changes: 7 additions & 12 deletions paddle/fluid/operators/conv_cudnn_helper.h
@@ -26,8 +26,6 @@ limitations under the License. */
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {
@@ -288,9 +286,8 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
} else {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();

// auto& temp = ctx.cuda_device_context();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetForward());

@@ -317,8 +314,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
-workspace_handle->RunFuncSync(cudnn_find_func,
-workspace_size_limit);
+workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);

VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
@@ -419,7 +415,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
} else {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();

AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardData());
Expand Down Expand Up @@ -449,8 +445,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
-workspace_handle->RunFuncSync(cudnn_find_func,
-workspace_size_limit);
+workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);

VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
@@ -541,7 +536,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
} else {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardFilter());

@@ -569,8 +564,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
-workspace_handle->RunFuncSync(cudnn_find_func,
-workspace_size_limit);
+workspace_handle.RunFuncSync(cudnn_find_func,
+workspace_size_limit);

VLOG(3)
<< "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
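The three search hunks above share one shape: the exhaustive algorithm search wraps the cuDNN "find algorithm" call in a lambda named cudnn_find_func and passes it to RunFuncSync, now via `.` on the value handle. Below is a self-contained, hedged sketch of that shape; the WorkspaceHandle stub and FindAlgo function are stand-ins for illustration, not the Paddle or cuDNN APIs, and only the lambda-plus-RunFuncSync structure comes from the diff.

```cpp
// Illustrative sketch of the search-path call shape after this change.
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

struct WorkspaceHandle {  // stand-in, not the Paddle type
  void RunFuncSync(const std::function<void(void*)>& func, size_t limit) {
    std::vector<char> scratch(limit);  // scratch buffer bounded by the search limit
    func(scratch.data());
    // The real handle would synchronize the GPU stream here.
  }
};

// Pretend "find algorithm" call: reports how many algorithms it benchmarked.
void FindAlgo(void* workspace, size_t limit, int* returned_count) {
  (void)workspace;
  (void)limit;
  *returned_count = 1;
}

int main() {
  WorkspaceHandle workspace_handle;        // obtained by value in the diff
  size_t workspace_size_limit = 64 << 20;  // e.g. a 64 MB search budget
  int returned_algo_count = 0;

  auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
    FindAlgo(cudnn_workspace_ptr, workspace_size_limit, &returned_algo_count);
  };
  // Matches the new call in the diff: a member call on the value handle.
  workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);

  std::cout << "FwdAlgo Perf result count: " << returned_algo_count << "\n";
  return 0;
}
```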
36 changes: 18 additions & 18 deletions paddle/fluid/operators/conv_cudnn_op.cu
@@ -238,7 +238,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
dtype};

auto handle = dev_ctx.cudnn_handle();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
@@ -326,7 +326,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");

#ifdef PADDLE_WITH_HIP
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
@@ -338,7 +338,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
workspace_size);
#else
for (int i = 0; i < groups; i++) {
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
@@ -607,7 +607,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
: DataLayout::kNCDHW;
}
auto layout_tensor = GetCudnnTensorFormat(layout);
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();

int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
@@ -719,7 +719,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
Tensor temp_tensor(transformed_input_grad.type());
temp_tensor.Resize(transformed_input_grad.dims());
T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
@@ -735,7 +735,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
temp_tensor_data, &beta, args1.idesc.desc(),
transformed_input_grad_data));
} else {
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
@@ -750,7 +750,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {

#else
for (int i = 0; i < groups; i++) {
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
@@ -797,7 +797,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
#ifdef PADDLE_WITH_HIP
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
@@ -809,7 +809,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size);
#else
for (int i = 0; i < groups; i++) {
-workspace_handle->RunFunc(
+workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
@@ -1224,13 +1224,13 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
-auto* wkspace_handle = dev_ctx.cudnn_workspace_handle();
+auto wkspace_handle = dev_ctx.cudnn_workspace_handle();

if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
@@ -1242,7 +1242,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
workspace_size);
#else
for (int i = 0; i < groups; i++) {
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
Expand All @@ -1260,7 +1260,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
if (ddW) {
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
@@ -1272,7 +1272,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
workspace_size);
#else
for (int i = 0; i < groups; i++) {
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
Expand All @@ -1296,7 +1296,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
if (dW && ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
@@ -1308,7 +1308,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
workspace_size);
#else
for (int i = 0; i < groups; i++) {
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
Expand All @@ -1327,7 +1327,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
if (dX && ddW) {
ddw = ddW->data<T>();
#ifdef PADDLE_WITH_HIP
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
@@ -1339,7 +1339,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
workspace_size);
#else
for (int i = 0; i < groups; i++) {
-wkspace_handle->RunFunc(
+wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
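Each kernel hunk above makes the same substitution inside its `#ifdef PADDLE_WITH_HIP` / `#else` branches: the per-group loop now calls RunFunc on the value handle. Below is a hedged sketch of that per-group pattern; the stand-in types, LaunchGroupConv, and the offset parameters are illustrative only, not the real Paddle code.

```cpp
// Illustrative sketch of the per-group convolution pattern after the change:
// a loop over groups, each iteration running its launch inside RunFunc.
#include <cstddef>
#include <functional>
#include <vector>

struct WorkspaceHandle {  // stand-in, not the Paddle type
  void RunFunc(const std::function<void(void*)>& func, size_t required_size) {
    std::vector<char> scratch(required_size);
    func(scratch.data());
  }
};

// Stand-in for the per-group cudnnConvolutionForward / BackwardData / BackwardFilter launch.
void LaunchGroupConv(int group, const float* in, const float* filter, float* out,
                     void* workspace, size_t workspace_size) {
  (void)group; (void)in; (void)filter; (void)out; (void)workspace; (void)workspace_size;
}

void RunGroupedConv(WorkspaceHandle workspace_handle, int groups,
                    const float* input, const float* filter, float* output,
                    size_t workspace_size, size_t group_offset_in,
                    size_t group_offset_filter, size_t group_offset_out) {
  for (int i = 0; i < groups; i++) {
    // New pattern from the diff: `.RunFunc` on the by-value handle, once per group.
    workspace_handle.RunFunc(
        [&](void* workspace_ptr) {
          LaunchGroupConv(i, input + i * group_offset_in,
                          filter + i * group_offset_filter,
                          output + i * group_offset_out, workspace_ptr,
                          workspace_size);
        },
        workspace_size);
  }
}
```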
12 changes: 6 additions & 6 deletions paddle/fluid/operators/conv_miopen_helper.h
@@ -132,7 +132,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
algo_t algo;

auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
@@ -146,7 +146,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

-workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
+workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.fwd_algo;
VLOG(3) << "choose algo " << algo;
return algo;
@@ -174,7 +174,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
algo_t algo;

auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
@@ -188,7 +188,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

-workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
+workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.bwd_data_algo;
VLOG(3) << "choose algo " << algo;
return algo;
@@ -216,7 +216,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
algo_t algo;

auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-auto* workspace_handle = dev_ctx.cudnn_workspace_handle();
+auto workspace_handle = dev_ctx.cudnn_workspace_handle();

int find_count;
miopenConvAlgoPerf_t find_result;
@@ -230,7 +230,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
cudnn_workspace_ptr, workspace_size, false));
};

-workspace_handle->RunFuncSync(cudnn_find_func, workspace_size);
+workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
algo = find_result.bwd_weights_algo;
VLOG(3) << "choose algo " << algo;
return algo;