From 259bb9404edb4cb8c5ae173e8627fc335f99fefa Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 19 Jun 2018 12:28:32 -0700
Subject: [PATCH] Fix #11241 Conv1D throws CUDNN_STATUS_EXECUTION_FAILED

---
 src/operator/nn/convolution.cu                | 20 ++++++++++++-----
 src/operator/nn/cudnn/cudnn_algoreg-inl.h     | 11 +++++++---
 src/operator/nn/cudnn/cudnn_convolution-inl.h | 22 +++++++++++++------
 .../nn/cudnn/cudnn_deconvolution-inl.h        | 20 ++++++++++++-----
 src/operator/nn/deconvolution.cu              | 20 ++++++++++++-----
 src/operator/operator_common.h                |  2 +-
 6 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index 9f61212d5c78..e3a2ca5001cf 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -41,7 +41,8 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
                                                  int backward_compute_type,
                                                  const std::vector<TShape>& in_shape,
                                                  const std::vector<TShape>& out_shape,
-                                                 const RunContext& rctx) {
+                                                 const RunContext& rctx,
+                                                 bool add_to_weight) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<ConvSignature,
                                          std::shared_ptr<CuDNNConvolutionOp<DType> >,
@@ -57,14 +58,18 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
     ndim += s.ndim();
   for (auto &s : out_shape)
     ndim += s.ndim();
-  key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
-              + ndim + 1 /* for dev_id */);
+  key.Reserve(1 /* for forward_compute_type */ +
+              1 /* for backward_compute_type */ +
+              ndim /* for in and out shapes */ +
+              1 /* for dev_id */ +
+              1 /* for add_to_weight */);
 
   key.AddSign(forward_compute_type);
   key.AddSign(backward_compute_type);
   key.AddSign(in_shape);
   key.AddSign(out_shape);
   key.AddSign(rctx.ctx.dev_id);
+  key.AddSign(add_to_weight ? 1 : 0);
 
   auto it = ops.find(key);
   if (it == ops.end()) {
@@ -74,7 +79,7 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
     CHECK(ins_ret.second);
     it = ins_ret.first;
     it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
-                     out_shape, rctx);
+                     out_shape, rctx, add_to_weight);
   }
   return *it->second;
 }
@@ -141,8 +146,10 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
     std::vector<TShape> out_shape(1, outputs[0].shape_);
     for (size_t i = 0; i < in_shape.size(); i++)
       in_shape[i] = inputs[i].shape_;
+    // req[conv::kWeight] is only set for backward, so assume the typical 'write' for now.
+    auto add_to_weight = false;
     CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
-        compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
+        compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight);
     op.Forward(ctx, inputs, req, outputs);
   }
 })
@@ -220,8 +227,9 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
     std::vector<TShape> out_shape(1, out_grad.shape_);
     for (size_t i = 0; i < in_shape.size(); i++)
       in_shape[i] = in_data[i].shape_;
+    auto add_to_weight = req[conv::kWeight] == kAddTo;
     CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
-        compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
+        compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight);
     op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
   }
 })
diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h
index e029c837bd4d..3b59fd1c3ced 100644
--- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h
+++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h
@@ -72,12 +72,13 @@ class CuDNNAlgoReg {
             cudnnDataType_t cudnn_forward_compute_type,
             cudnnDataType_t cudnn_backward_compute_type,
             int sm_arch,
+            bool add_to_weight,
             CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
             CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
             CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
     CHECK(in_shape.size() == 2 || in_shape.size() == 3);
     ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
-                 cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch};
+                 cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
     std::lock_guard<std::mutex> guard(lock_);
     auto i = reg_.find(key);
     if (i != reg_.end()) {
@@ -96,12 +97,13 @@ class CuDNNAlgoReg {
                 cudnnDataType_t cudnn_forward_compute_type,
                 cudnnDataType_t cudnn_backward_compute_type,
                 int sm_arch,
+                bool add_to_weight,
                 const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
                 const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
                 const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
     CHECK(in_shape.size() == 2 || in_shape.size() == 3);
     ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
-                 cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch};
+                 cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
     std::lock_guard<std::mutex> guard(lock_);
     if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
       LOG(INFO) << "Running performance tests to find the best convolution "
@@ -140,6 +142,7 @@ class CuDNNAlgoReg {
     cudnnDataType_t cudnn_forward_compute_type;
     cudnnDataType_t cudnn_backward_compute_type;
     int sm_arch;
+    bool add_to_weight;
 
     bool operator==(const ParamKey& other) const {
       return this->param == other.param &&
@@ -149,7 +152,8 @@ class CuDNNAlgoReg {
              this->cudnn_data_type == other.cudnn_data_type &&
              this->cudnn_forward_compute_type == other.cudnn_forward_compute_type &&
              this->cudnn_backward_compute_type == other.cudnn_backward_compute_type &&
-             this->sm_arch == other.sm_arch;
+             this->sm_arch == other.sm_arch &&
+             this->add_to_weight == other.add_to_weight;
     }
   };
 
@@ -164,6 +168,7 @@ class CuDNNAlgoReg {
       ret = dmlc::HashCombine(ret, static_cast<int>(key.cudnn_forward_compute_type));
       ret = dmlc::HashCombine(ret, static_cast<int>(key.cudnn_backward_compute_type));
       ret = dmlc::HashCombine(ret, key.sm_arch);
+      ret = dmlc::HashCombine(ret, key.add_to_weight);
       return ret;
     }
   };
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index 4b1cbbe7057b..c5fdf3d42b81 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -59,9 +59,11 @@ class CuDNNConvolutionOp {
             int backward_compute_type,
             const std::vector<TShape>& in_shape,
             const std::vector<TShape>& out_shape,
-            const RunContext& rctx) {
+            const RunContext& rctx,
+            bool add_to_weight) {
     using namespace mshadow;
     this->param_ = param;
+    this->add_to_weight_ = add_to_weight;
     InitBufferForParam();
     auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type);
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
@@ -247,6 +249,7 @@ class CuDNNConvolutionOp {
                                      gbias.dptr_));
       }
       if (req[conv::kWeight] != kNullOp) {
+          CHECK_EQ(add_to_weight_, req[conv::kWeight] == kAddTo);
           CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
               &alpha,
               in_desc_,
@@ -610,8 +613,8 @@ class CuDNNConvolutionOp {
                   cudnnDataType_t cudnn_backward_compute_type) {
     if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_,
                                        cudnn_forward_compute_type, cudnn_backward_compute_type,
-                                       SMArch(rctx.ctx.dev_id), &forward_algo_, &back_algo_,
-                                       &back_algo_w_)) {
+                                       SMArch(rctx.ctx.dev_id), add_to_weight_,
+                                       &forward_algo_, &back_algo_, &back_algo_w_)) {
       mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
       CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
       size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
@@ -645,9 +648,11 @@ class CuDNNConvolutionOp {
         auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
         std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
         int actual_bwd_filter_algos = 0;
-        auto bwd_filter_algo_discoverer =
-          param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
-                                                  : cudnnFindConvolutionBackwardFilterAlgorithm;
+        // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
+        // were summing into the output (i.e. beta != 0). Get() returned OK algos though.
+        auto bwd_filter_algo_discoverer = (add_to_weight_ ||
+          param_.cudnn_tune.value() == conv::kOff) ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
+                                                   : cudnnFindConvolutionBackwardFilterAlgorithm;
         CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_,
                                                  in_desc_,
                                                  out_desc_,
@@ -799,7 +804,8 @@ class CuDNNConvolutionOp {
       CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_,
                                         cudnn_forward_compute_type,
                                         cudnn_backward_compute_type,
-                                        SMArch(rctx.ctx.dev_id), this->forward_algo_,
+                                        SMArch(rctx.ctx.dev_id), this->add_to_weight_,
+                                        this->forward_algo_,
                                         this->back_algo_, this->back_algo_w_);
     }
     // If we're allowing Tensor Core variants of the algos to be considered in
@@ -953,6 +959,8 @@ class CuDNNConvolutionOp {
   cudnnTensorFormat_t format_;
   // Allow TensorCore algo policy
   bool cudnn_tensor_core_;
+  // Is req[kWeight] == conv::kAddTo ?
+  bool add_to_weight_;
   ConvolutionParam param_;
 };
 #endif // __CUDACC__ && CUDNN
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index cb0de4c961bf..85fe604b0817 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -56,9 +56,11 @@ class CuDNNDeconvolutionOp {
             int backward_compute_type,
             const std::vector<TShape>& in_shape,
             const std::vector<TShape>& out_shape,
-            const RunContext& rctx) {
+            const RunContext& rctx,
+            bool add_to_weight) {
     using namespace mshadow;
     this->param_ = param;
+    this->add_to_weight_ = add_to_weight;
     InitBufferForParam();
     auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type);
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
@@ -257,6 +259,7 @@ class CuDNNDeconvolutionOp {
                    filter_desc_,
                    gwmat.dptr_ + weight_offset_ * g));
         #elif CUDNN_MAJOR >= 5
+        CHECK_EQ(add_to_weight_, req[deconv::kWeight] == kAddTo);
         CUDNN_CALL(cudnnConvolutionBackwardFilter(
           s->dnn_handle_,
           &alpha,
@@ -543,8 +546,8 @@ class CuDNNDeconvolutionOp {
     if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_,
                                          cudnn_forward_compute_type,
                                          cudnn_backward_compute_type,
-                                         SMArch(rctx.ctx.dev_id), &forward_algo_,
-                                         &back_algo_, &back_algo_w_)) {
+                                         SMArch(rctx.ctx.dev_id), add_to_weight_,
+                                         &forward_algo_, &back_algo_, &back_algo_w_)) {
       mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
       CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
       size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
@@ -578,8 +581,10 @@ class CuDNNDeconvolutionOp {
         auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
         std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
         int actual_bwd_filter_algos = 0;
-        auto bwd_filter_algo_discoverer =
-          param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
+        // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
+        // were summing into the output (i.e. beta != 0). Get() returned OK algos though.
+        auto bwd_filter_algo_discoverer = (add_to_weight_ ||
+          param_.cudnn_tune.value() == conv::kOff) ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
                                                    : cudnnFindConvolutionBackwardFilterAlgorithm;
         CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_,
                                                  out_desc_,
@@ -735,7 +740,8 @@ class CuDNNDeconvolutionOp {
       CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_,
                                           cudnn_forward_compute_type,
                                           cudnn_backward_compute_type,
-                                          SMArch(rctx.ctx.dev_id), this->forward_algo_,
+                                          SMArch(rctx.ctx.dev_id), this->add_to_weight_,
+                                          this->forward_algo_,
                                           this->back_algo_, this->back_algo_w_);
     }
     // If we're allowing Tensor Core variants of the algos to be considered in
@@ -912,6 +918,8 @@ class CuDNNDeconvolutionOp {
   cudnnTensorFormat_t format_;
   // Allow TensorCore algo policy
   bool cudnn_tensor_core_;
+  // Is req[kWeight] == deconv::kAddTo ?
+  bool add_to_weight_;
   DeconvolutionParam param_;
 };
 #endif // CUDNN
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index cdfb606900bf..1c3970b9e716 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -39,7 +39,8 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
                                                      int backward_compute_type,
                                                      const std::vector<TShape>& in_shape,
                                                      const std::vector<TShape>& out_shape,
-                                                     const RunContext& rctx) {
+                                                     const RunContext& rctx,
+                                                     bool add_to_weight) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<DeconvSignature,
                                          std::shared_ptr<CuDNNDeconvolutionOp<DType> >,
@@ -55,14 +56,18 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
     ndim += s.ndim();
   for (auto &s : out_shape)
     ndim += s.ndim();
-  key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
-              + ndim + 1 /* for dev_id */);
+  key.Reserve(1 /* for forward_compute_type */ +
+              1 /* for backward_compute_type */ +
+              ndim /* for in and out shapes */ +
+              1 /* for dev_id */ +
+              1 /* for add_to_weight */);
 
   key.AddSign(forward_compute_type);
   key.AddSign(backward_compute_type);
   key.AddSign(in_shape);
   key.AddSign(out_shape);
   key.AddSign(rctx.ctx.dev_id);
+  key.AddSign(add_to_weight ? 1 : 0);
 
   auto it = ops.find(key);
   if (it == ops.end()) {
@@ -72,7 +77,7 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
     CHECK(ins_ret.second);
     it = ins_ret.first;
     it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
-                     out_shape, rctx);
+                     out_shape, rctx, add_to_weight);
   }
   return *it->second;
 }
@@ -109,8 +114,10 @@ void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
     for (size_t i = 0; i < in_shape.size(); i++) {
       in_shape[i] = inputs[i].shape_;
     }
+    // req[deconv::kWeight] is only set for backward, so assume the typical 'write' for now.
+    auto add_to_weight = false;
     GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
-        in_shape, out_shape, ctx.run_ctx).Forward(ctx, inputs, req, outputs);
+        in_shape, out_shape, ctx.run_ctx, add_to_weight).Forward(ctx, inputs, req, outputs);
   }
 })
 #else
@@ -156,8 +163,9 @@ void DeconvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
     for (size_t i = 0; i < in_shape.size(); i++) {
      in_shape[i] = in_data[i].shape_;
     }
+    auto add_to_weight = req[deconv::kWeight] == kAddTo;
     GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
-        in_shape, out_shape, ctx.run_ctx).Backward(ctx,
+        in_shape, out_shape, ctx.run_ctx, add_to_weight).Backward(ctx,
       std::vector<TBlob>{out_grad}, in_data, req, in_grad);
   }
 })
diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h
index 0a9cd08db81b..0b28d558e134 100644
--- a/src/operator/operator_common.h
+++ b/src/operator/operator_common.h
@@ -494,7 +494,7 @@ inline void LogUnimplementedOp(const nnvm::NodeAttrs& attrs,
 }
 
 class OpSignature {
-  std::vector<int> eles;
+  std::vector<int64_t> eles;
   uint64_t hash;
 
  public:
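
Illustrative sketch of the code path this patch distinguishes: the wgrad kAddTo case
(add_to_weight == true) can be exercised from the Python API by requesting 'add' gradient
accumulation on a Conv1D layer. Layer sizes, batch shape, and the GPU index below are
illustrative assumptions, not part of the patch or its tests.

    # Minimal sketch, assuming an MXNet build with CUDA/cuDNN and a GPU at index 0.
    import mxnet as mx
    from mxnet import autograd, gluon

    ctx = mx.gpu(0)
    net = gluon.nn.Conv1D(channels=1024, kernel_size=3, in_channels=1024)
    net.initialize(ctx=ctx)
    # grad_req='add' makes backward accumulate into the weight gradient (beta != 0),
    # i.e. the add_to_weight_ == true branch that now prefers cudnnGet* over cudnnFind*.
    net.collect_params().setattr('grad_req', 'add')

    x = mx.nd.random.uniform(shape=(8, 1024, 64), ctx=ctx)  # NCW layout
    with autograd.record():
        y = net(x)
    y.backward()
    mx.nd.waitall()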