improve error message of cudnn operators

apache · Jul 25, 2018 · c6d6b2e · c6d6b2e
1 parent 0b8b939
commit c6d6b2e
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 11 deletions.
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -718,7 +718,9 @@ class CuDNNConvolutionOp {
                        && fwd_algo[i].memory > workspace_byte)))
           ++i;
         if (i == nalgo) {
-          LOG(FATAL) << "Failed to find a forward convolution algorithm.";
+          LOG(FATAL) << "Failed to find a forward convolution algorithm. "
+                        "Usually this is due to lack of GPU memory, "
+                        "please consider reducing batch size or model size.";
         } else {
           forward_algo_.Set(fwd_algo[i].algo, false);
         }
@@ -752,7 +754,9 @@ class CuDNNConvolutionOp {
                        && bwd_filter_algo[i].memory > workspace_byte)))
           ++i;
         if (i == nalgo) {
-          LOG(FATAL) << "Failed to find a backward filter convolution algorithm.";
+          LOG(FATAL) << "Failed to find a backward filter convolution algorithm. "
+                        "Usually this is due to lack of GPU memory, "
+                        "please consider reducing batch size or model size.";
         } else {
           back_algo_w_.Set(bwd_filter_algo[i].algo, false);
         }
@@ -786,7 +790,9 @@ class CuDNNConvolutionOp {
                        && bwd_data_algo[i].memory > workspace_byte)))
           ++i;
         if (i == nalgo) {
-          LOG(FATAL) << "Failed to find a backward data convolution algorithm.";
+          LOG(FATAL) << "Failed to find a backward data convolution algorithm. "
+                        "Usually this is due to lack of GPU memory, "
+                        "please consider reducing batch size or model size.";
         } else {
           back_algo_.Set(bwd_data_algo[i].algo, false);
         }
@@ -833,7 +839,9 @@ class CuDNNConvolutionOp {
       }
     }
     auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find ";
-    LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm.";
+    LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm. "
+               << "Usually this is caused by lack of GPU memory, "
+               << "please consider reducing batch size or model size.";
   }
 
   void GetTempSize(const OpContext& ctx) {

diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -651,8 +651,10 @@ class CuDNNDeconvolutionOp {
                        && fwd_algo[i].memory > workspace_byte)))
           ++i;
         if (i == nalgo) {
-          LOG(FATAL) << "Failed to find a 'forward' convolution algorithm " <<
-                     "(for use in deconvolution operator backprop-to-data).";
+          LOG(FATAL) << "Failed to find a 'forward' convolution algorithm "
+                     << "(for use in deconvolution operator backprop-to-data). "
+                     << "Usually this is due to lack of GPU memory, "
+                     << "please consider reducing batch size or model size.";
         } else {
           forward_algo_.Set(fwd_algo[i].algo, false);
         }
@@ -686,8 +688,10 @@ class CuDNNDeconvolutionOp {
                        && bwd_filter_algo[i].memory > workspace_byte)))
           ++i;
         if (i == nalgo) {
-          LOG(FATAL) << "Failed to find a backward filter convolution algorithm " <<
-                     "(for use in deconvolution operator backprop-to-filter).";
+          LOG(FATAL) << "Failed to find a backward filter convolution algorithm "
+                     << "(for use in deconvolution operator backprop-to-filter). "
+                     << "Usually this is due to lack of GPU memory, "
+                     << "please consider reducing batch size or model size.";
         } else {
           back_algo_w_.Set(bwd_filter_algo[i].algo, false);
         }
@@ -721,8 +725,10 @@ class CuDNNDeconvolutionOp {
                        && bwd_data_algo[i].memory > workspace_byte)))
           ++i;
         if (i == nalgo) {
-          LOG(FATAL) << "Failed to find a backward data convolution algorithm." <<
-                     "(for use in deconvolution operator forward inference).";
+          LOG(FATAL) << "Failed to find a backward data convolution algorithm "
+                     << "(for use in deconvolution operator forward inference). "
+                     << "Usually this is due to lack of GPU memory, "
+                     << "please consider reducing batch size or model size.";
         } else {
           back_algo_.Set(bwd_data_algo[i].algo, false);
         }
@@ -774,7 +780,9 @@ class CuDNNDeconvolutionOp {
       }
     }
     auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find ";
-    LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm.";
+    LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm. "
+               << "Usually this is due to lack of GPU memory, "
+               << "please consider reducing batch size or model size.";
   }
 
   void GetTempSize(const OpContext& ctx) {