Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
[MXNET-11241] Avoid use of troublesome cudnnFind() results when grad_req='add' (#11338)
Browse files Browse the repository at this point in the history

* Add tests that fail due to issue 11241

* Fix #11241 Conv1D throws CUDNN_STATUS_EXECUTION_FAILED

* Force algo 1 when grad_req==add with large c.  Expand tests.

* Shorten test runtimes.
  • Loading branch information
DickJC123 authored and eric-haibin-lin committed Jul 30, 2018
1 parent b2fd3b1 commit 024b5a9
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 24 deletions.
20 changes: 14 additions & 6 deletions src/operator/nn/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<ConvSignature,
std::shared_ptr<CuDNNConvolutionOp<DType> >,
Expand All @@ -57,14 +58,18 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
ndim += s.ndim();
for (auto &s : out_shape)
ndim += s.ndim();
key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
+ ndim + 1 /* for dev_id */);
key.Reserve(1 /* for forward_compute_type */ +
1 /* for backward_compute_type */ +
ndim /* for in and out shapes */ +
1 /* for dev_id */ +
1 /* for add_to_weight */);

key.AddSign(forward_compute_type);
key.AddSign(backward_compute_type);
key.AddSign(in_shape);
key.AddSign(out_shape);
key.AddSign(rctx.ctx.dev_id);
key.AddSign(add_to_weight ? 1 : 0);

auto it = ops.find(key);
if (it == ops.end()) {
Expand All @@ -74,7 +79,7 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
CHECK(ins_ret.second);
it = ins_ret.first;
it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
out_shape, rctx);
out_shape, rctx, add_to_weight);
}
return *it->second;
}
Expand Down Expand Up @@ -141,8 +146,10 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
std::vector<TShape> out_shape(1, outputs[0].shape_);
for (size_t i = 0; i < in_shape.size(); i++)
in_shape[i] = inputs[i].shape_;
// req[conv::kWeight] is only set for backward, so assume the typical 'write' for now.
auto add_to_weight = false;
CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight);
op.Forward(ctx, inputs, req, outputs);
}
})
Expand Down Expand Up @@ -220,8 +227,9 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
std::vector<TShape> out_shape(1, out_grad.shape_);
for (size_t i = 0; i < in_shape.size(); i++)
in_shape[i] = in_data[i].shape_;
auto add_to_weight = req[conv::kWeight] == kAddTo;
CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight);
op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
}
})
Expand Down
11 changes: 8 additions & 3 deletions src/operator/nn/cudnn/cudnn_algoreg-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,13 @@ class CuDNNAlgoReg {
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch};
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
auto i = reg_.find(key);
if (i != reg_.end()) {
Expand All @@ -96,12 +97,13 @@ class CuDNNAlgoReg {
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch};
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
Expand Down Expand Up @@ -140,6 +142,7 @@ class CuDNNAlgoReg {
cudnnDataType_t cudnn_forward_compute_type;
cudnnDataType_t cudnn_backward_compute_type;
int sm_arch;
bool add_to_weight;

bool operator==(const ParamKey& other) const {
return this->param == other.param &&
Expand All @@ -149,7 +152,8 @@ class CuDNNAlgoReg {
this->cudnn_data_type == other.cudnn_data_type &&
this->cudnn_forward_compute_type == other.cudnn_forward_compute_type &&
this->cudnn_backward_compute_type == other.cudnn_backward_compute_type &&
this->sm_arch == other.sm_arch;
this->sm_arch == other.sm_arch &&
this->add_to_weight == other.add_to_weight;
}
};

Expand All @@ -164,6 +168,7 @@ class CuDNNAlgoReg {
ret = dmlc::HashCombine(ret, static_cast<int>(key.cudnn_forward_compute_type));
ret = dmlc::HashCombine(ret, static_cast<int>(key.cudnn_backward_compute_type));
ret = dmlc::HashCombine(ret, key.sm_arch);
ret = dmlc::HashCombine(ret, key.add_to_weight);
return ret;
}
};
Expand Down
36 changes: 32 additions & 4 deletions src/operator/nn/cudnn/cudnn_convolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,11 @@ class CuDNNConvolutionOp {
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
using namespace mshadow;
this->param_ = param;
this->add_to_weight_ = add_to_weight;
InitBufferForParam();
auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type);
auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
Expand Down Expand Up @@ -247,6 +249,7 @@ class CuDNNConvolutionOp {
gbias.dptr_));
}
if (req[conv::kWeight] != kNullOp) {
CHECK_EQ(add_to_weight_, req[conv::kWeight] == kAddTo);
CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
&alpha,
in_desc_,
Expand Down Expand Up @@ -610,8 +613,8 @@ class CuDNNConvolutionOp {
cudnnDataType_t cudnn_backward_compute_type) {
if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type, cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), &forward_algo_, &back_algo_,
&back_algo_w_)) {
SMArch(rctx.ctx.dev_id), add_to_weight_,
&forward_algo_, &back_algo_, &back_algo_w_)) {
mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
Expand Down Expand Up @@ -645,6 +648,8 @@ class CuDNNConvolutionOp {
auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
int actual_bwd_filter_algos = 0;
// In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
// were summing into the output (i.e. beta != 0). Get() returned OK algos though.
auto bwd_filter_algo_discoverer =
param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
: cudnnFindConvolutionBackwardFilterAlgorithm;
Expand Down Expand Up @@ -792,14 +797,22 @@ class CuDNNConvolutionOp {
}
}
#endif // CUDNN_MAJOR < 7

// Fix for issue #11241
int cudnn_find_issue_max_features = 64 * 1024;
if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) {
this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true);
}

// An algo specification by the user may be cached here, but another
// convolution will match only if identically specified.
// We're caching results of *Get* as well as *Find*, but these records
// will be held distinctly because param_.cudnn_tune is part of the key.
CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type,
cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), this->forward_algo_,
SMArch(rctx.ctx.dev_id), this->add_to_weight_,
this->forward_algo_,
this->back_algo_, this->back_algo_w_);
}
// If we're allowing Tensor Core variants of the algos to be considered in
Expand Down Expand Up @@ -921,6 +934,19 @@ class CuDNNConvolutionOp {
return tensor.MSize() * sizeof(DType);
}

// Given a tensor shape of this operation, return the number of features 'c'
int64_t Features(const TShape &dshape) {
int c = 0;
switch (dshape.ndim()) {
case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break;
case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break;
case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break;
default:
LOG(FATAL) << "Unexpected convolution data dimension " << dshape.ndim();
}
return c;
}

std::vector<int> param_stride_;
std::vector<int> param_dilate_;
std::vector<int> param_pad_;
Expand Down Expand Up @@ -953,6 +979,8 @@ class CuDNNConvolutionOp {
cudnnTensorFormat_t format_;
// Allow TensorCore algo policy
bool cudnn_tensor_core_;
// Is req[kWeight] == conv::kAddTo ?
bool add_to_weight_;
ConvolutionParam param_;
};
#endif // __CUDACC__ && CUDNN
Expand Down
38 changes: 34 additions & 4 deletions src/operator/nn/cudnn/cudnn_deconvolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ class CuDNNDeconvolutionOp {
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
using namespace mshadow;
this->param_ = param;
this->add_to_weight_ = add_to_weight;
InitBufferForParam();
auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type);
auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
Expand Down Expand Up @@ -257,6 +259,7 @@ class CuDNNDeconvolutionOp {
filter_desc_,
gwmat.dptr_ + weight_offset_ * g));
#elif CUDNN_MAJOR >= 5
CHECK_EQ(add_to_weight_, req[deconv::kWeight] == kAddTo);
CUDNN_CALL(cudnnConvolutionBackwardFilter(
s->dnn_handle_,
&alpha,
Expand Down Expand Up @@ -543,8 +546,8 @@ class CuDNNDeconvolutionOp {
if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type,
cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), &forward_algo_,
&back_algo_, &back_algo_w_)) {
SMArch(rctx.ctx.dev_id), add_to_weight_,
&forward_algo_, &back_algo_, &back_algo_w_)) {
mshadow::Stream <gpu> *s = rctx.get_stream<gpu>();
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
Expand Down Expand Up @@ -578,6 +581,8 @@ class CuDNNDeconvolutionOp {
auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
int actual_bwd_filter_algos = 0;
// In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
// were summing into the output (i.e. beta != 0). Get() returned OK algos though.
auto bwd_filter_algo_discoverer =
param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
: cudnnFindConvolutionBackwardFilterAlgorithm;
Expand Down Expand Up @@ -728,14 +733,23 @@ class CuDNNDeconvolutionOp {
}
}
#endif // CUDNN_MAJOR < 7

// Fix for issue #11241
int cudnn_find_issue_max_features = 64 * 1024;
// With deconvolution, the algo sensitivity is to a large number of output features
if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) {
this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true);
}

// An algo specification by the user may be cached here, but another
// convolution will match only if identically specified.
// We're caching results of *Get* as well as *Find*, but these records
// will be held distinctly because param_.cudnn_tune is part of the key.
CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type,
cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), this->forward_algo_,
SMArch(rctx.ctx.dev_id), this->add_to_weight_,
this->forward_algo_,
this->back_algo_, this->back_algo_w_);
}
// If we're allowing Tensor Core variants of the algos to be considered in
Expand Down Expand Up @@ -866,6 +880,20 @@ class CuDNNDeconvolutionOp {
return tensor.MSize() * sizeof(DType);
}


// Given a tensor shape of this operation, return the number of features 'c'
int64_t Features(const TShape &dshape) {
int c = 0;
switch (dshape.ndim()) {
case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break;
case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break;
case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break;
default:
LOG(FATAL) << "Unexpected deconvolution data dimension " << dshape.ndim();
}
return c;
}

std::vector<int> param_stride_;
std::vector<int> param_dilate_;

Expand Down Expand Up @@ -912,6 +940,8 @@ class CuDNNDeconvolutionOp {
cudnnTensorFormat_t format_;
// Allow TensorCore algo policy
bool cudnn_tensor_core_;
// Is req[kWeight] == deconv::kAddTo ?
bool add_to_weight_;
DeconvolutionParam param_;
};
#endif // CUDNN
Expand Down
20 changes: 14 additions & 6 deletions src/operator/nn/deconvolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<DeconvSignature,
std::shared_ptr<CuDNNDeconvolutionOp<DType> >,
Expand All @@ -55,14 +56,18 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
ndim += s.ndim();
for (auto &s : out_shape)
ndim += s.ndim();
key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
+ ndim + 1 /* for dev_id */);
key.Reserve(1 /* for forward_compute_type */ +
1 /* for backward_compute_type */ +
ndim /* for in and out shapes */ +
1 /* for dev_id */ +
1 /* for add_to_weight */);

key.AddSign(forward_compute_type);
key.AddSign(backward_compute_type);
key.AddSign(in_shape);
key.AddSign(out_shape);
key.AddSign(rctx.ctx.dev_id);
key.AddSign(add_to_weight ? 1 : 0);

auto it = ops.find(key);
if (it == ops.end()) {
Expand All @@ -72,7 +77,7 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
CHECK(ins_ret.second);
it = ins_ret.first;
it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
out_shape, rctx);
out_shape, rctx, add_to_weight);
}
return *it->second;
}
Expand Down Expand Up @@ -109,8 +114,10 @@ void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
for (size_t i = 0; i < in_shape.size(); i++) {
in_shape[i] = inputs[i].shape_;
}
// req[deconv::kWeight] is only set for backward, so assume the typical 'write' for now.
auto add_to_weight = false;
GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
in_shape, out_shape, ctx.run_ctx).Forward(ctx, inputs, req, outputs);
in_shape, out_shape, ctx.run_ctx, add_to_weight).Forward(ctx, inputs, req, outputs);
}
})
#else
Expand Down Expand Up @@ -156,8 +163,9 @@ void DeconvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
for (size_t i = 0; i < in_shape.size(); i++) {
in_shape[i] = in_data[i].shape_;
}
auto add_to_weight = req[deconv::kWeight] == kAddTo;
GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
in_shape, out_shape, ctx.run_ctx).Backward(ctx,
in_shape, out_shape, ctx.run_ctx, add_to_weight).Backward(ctx,
std::vector<TBlob>{out_grad}, in_data, req, in_grad);
}
})
Expand Down
2 changes: 1 addition & 1 deletion src/operator/operator_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ inline void LogUnimplementedOp(const nnvm::NodeAttrs& attrs,
}

class OpSignature {
std::vector<int> eles;
std::vector<int64_t> eles;
uint64_t hash;

public:
Expand Down
Loading

0 comments on commit 024b5a9

Please sign in to comment.