From b051d8c12e171c2ca686795a2c31dbb0366f0782 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Wed, 13 Mar 2019 12:17:05 -0700
Subject: [PATCH] Cudnn conv dgrad algo filtering (#14310)

* Add test exposing issue with conv dgrad algo 3 for some cudnn's.

* Add test temporarily to tests run with tensorrt CI build (cuda10, cudnn7.4.2)

* Relax tol of new test.

* Fix for problematic conv dgrad algo 3 for some cuDNNs.

* Add algo exclusion term to cudnnFind result processing.

* Revert "Add test temporarily to tests run with tensorrt CI build (cuda10, cudnn7.4.2)"

This reverts commit 1cb743bd2313ca5d1845dbdda6cccd9dcaa5d30c.

* Trigger CI.

* Add link to cuDNN release notes.

* Trigger CI.
---
 src/operator/nn/cudnn/cudnn_convolution-inl.h | 25 +++++++++++++++---
 .../nn/cudnn/cudnn_deconvolution-inl.h        | 24 ++++++++++++++---
 tests/python/gpu/test_operator_gpu.py         | 26 +++++++++++++++++++
 3 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index 66df82e4395e..55b263896339 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -521,7 +521,19 @@ class CuDNNConvolutionOp {
                                            wshape[1],
                                            wshape[2],
                                            wshape[3]));
-
+#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = pad[0];
+      auto pad_w = pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+#endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
 #if CUDNN_MAJOR >= 5
@@ -714,7 +726,7 @@ class CuDNNConvolutionOp {
       bwd_data_results.resize(actual_bwd_data_algos);
       AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t,
                       cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data",
-                                                     workspace_byte, bwd);
+                                                     workspace_byte, bwd, exclude_dgrad_algo_);
 #else
       // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -910,12 +922,14 @@ class CuDNNConvolutionOp {
   // workspace constraints.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo that matches the algo_preference (-1 = any),
     // regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
 #if CUDNN_MAJOR >= 7
       algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
@@ -924,7 +938,8 @@ class CuDNNConvolutionOp {
 #if CUDNN_MAJOR >= 7
           (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
 #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }
@@ -1104,6 +1119,8 @@ class CuDNNConvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == conv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   ConvolutionParam param_;
 };
 #endif  // __CUDACC__ && CUDNN
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index ec95d2be3309..47f688c8ab9c 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -446,6 +446,19 @@ class CuDNNDeconvolutionOp {
                                            wshape[1],
                                            wshape[2],
                                            wshape[3]));
+#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = o_pad[0];
+      auto pad_w = o_pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+#endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
       index_t o_pad[3];
@@ -618,7 +631,7 @@ class CuDNNDeconvolutionOp {
       bwd_data_results.resize(actual_bwd_data_algos);
       AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t,
                       cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data",
-                                                     workspace_byte, bwd);
+                                                     workspace_byte, bwd, exclude_dgrad_algo_);
 #else
       // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -829,11 +842,13 @@ class CuDNNDeconvolutionOp {
   // workspace constraints and a possible user algo preference.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
 #if CUDNN_MAJOR >= 7
       algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
@@ -842,7 +857,8 @@ class CuDNNDeconvolutionOp {
 #if CUDNN_MAJOR >= 7
           (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
 #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }
@@ -1025,6 +1041,8 @@ class CuDNNDeconvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == deconv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   DeconvolutionParam param_;
 };
 #endif  // CUDNN
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 38809921f8c6..fbbfc53a9a5e 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -522,6 +522,32 @@ def test_convolution_options():
     check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
 
 
+@with_seed()
+def test_conv_deconv_guards():
+    # Test cases for convolution and deconvolution via strided fft.  Ensure that the framework
+    # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
+    # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750
+    tol = 1e-1
+    for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
+        dataname = opname + '_data'
+        ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}
+        test_cases = [
+            {'num_filter':32, 'kernel':(6,6), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,6), 'pad':(1,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,7), 'pad':(0,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,6), 'pad':(1,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(1,1), 'stride':(2,2), 'name': opname}]
+        for test_case_args in test_cases:
+            try:
+                sym = op(**test_case_args)
+                sym_no_cudnn = op(cudnn_off=True, **test_case_args)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol)
+            except:
+                print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args))
+                raise
+
+
 def _conv_with_num_streams(seed):
     with random_seed(seed):
         # Try to expose timing-dependent improper workspace sharing by parallel dgrad and wgrad
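
As a rough standalone illustration of the configuration this patch guards against (NCHW layout, stride 2, even kernel, even pad, where dgrad may be served by CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING), the sketch below mirrors what test_conv_deconv_guards checks through check_consistency, but uses only the public NDArray/autograd API. It assumes an MXNet GPU build with cuDNN and a GPU at device 0; the helper name dgrad_of and the 1e-1 tolerances are illustrative choices, not part of the patch.

# Minimal sketch (not part of the patch): compare the cuDNN backprop-to-data
# result against the non-cuDNN reference for a stride-2, even-kernel conv.
import numpy as np
import mxnet as mx
from mxnet import autograd, nd

def dgrad_of(data, cudnn_off):
    # Gradient w.r.t. data for an NCHW convolution with kernel (6,6),
    # stride (2,2), pad (0,0) -- the shape family guarded by this patch.
    data = data.copy()
    data.attach_grad()
    weight = nd.ones((32, 32, 6, 6), ctx=data.context)
    with autograd.record():
        out = nd.Convolution(data, weight, num_filter=32, kernel=(6, 6),
                             stride=(2, 2), pad=(0, 0), no_bias=True,
                             cudnn_off=cudnn_off)
    out.backward()
    return data.grad.asnumpy()

data = nd.random.uniform(shape=(32, 32, 64, 64), ctx=mx.gpu(0))
# With the fix (or on unaffected cuDNN versions) both paths should agree.
np.testing.assert_allclose(dgrad_of(data, False), dgrad_of(data, True),
                           rtol=1e-1, atol=1e-1)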