This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

cuda/cuDNN lib version checking. Force cuDNN v7 usage. #15449

Merged: 5 commits, Jul 12, 2019
10 changes: 10 additions & 0 deletions docs/faq/env_var.md
@@ -242,6 +242,16 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.

* MXNET_CUDA_VERSION_CHECKING
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows various runtime checks of the cuda library version and associated warning messages.
- If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)

* MXNET_CUDNN_VERSION_CHECKING
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows various runtime checks of the cuDNN library version and associated warning messages.
- If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)

* MXNET_GLUON_REPO
- Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/')```
- The repository url to be used for Gluon datasets and pre-trained models.
8 changes: 4 additions & 4 deletions include/mxnet/resource.h
@@ -44,11 +44,11 @@ struct ResourceRequest {
kTempSpace,
/*! \brief common::RandGenerator<xpu> object, which can be used in GPU kernel functions */
kParallelRandom
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
,
/*! \brief cudnnDropoutDescriptor_t object for GPU dropout kernel functions */
kCuDNNDropoutDesc
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1
};
/*! \brief type of resources */
Type type;
@@ -162,7 +162,7 @@ struct Resource {
reinterpret_cast<DType*>(get_space_internal(shape.Size() * sizeof(DType))),
shape, shape[ndim - 1], stream);
}
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
/*!
* \brief Get cudnn dropout descriptor from shared state space.
*
@@ -175,7 +175,7 @@
mshadow::Stream<gpu> *stream,
const float dropout,
uint64_t seed) const;
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1

/*!
* \brief Get CPU space as mshadow Tensor in specified type.
105 changes: 105 additions & 0 deletions src/common/cuda_utils.cc
@@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file cuda_utils.cc
* \brief CUDA debugging utilities.
*/

#include <mxnet/base.h>
#include "cuda_utils.h"

#if MXNET_USE_CUDA == 1

namespace mxnet {
namespace common {
namespace cuda {

// The oldest version of cuda used in upstream MXNet CI testing, both for unix and windows.
// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
// their systems to match the CI level. Minimally, users should rerun the CI locally.
#if defined(_MSC_VER)
#define MXNET_CI_OLDEST_CUDA_VERSION 9020
#else
#define MXNET_CI_OLDEST_CUDA_VERSION 10000
#endif

// Dynamic init here will emit a warning if runtime and compile-time cuda lib versions mismatch.
// Also if the user has recompiled their source to a version no longer tested by upstream CI.
bool cuda_version_check_performed = []() {
// Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="")
if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) {
Comment (Contributor):

@DickJC123 we have detected an error when running a GPU-compiled MXNet on a CPU machine: during the build, mxnet is loaded to generate the operator bindings. My colleague will file a ticket about this. It would be great to have your guidance on whether the underlying cudaGetDeviceCount can run without a driver, as the call is failing. Our thinking is that before this change we were not calling this CUDA function at load time. A possible solution is to add a function that checks whether GPUs are available, if the GPU count really cannot be queried without GPUs, which is a bit puzzling.

Comment (Contributor):

Filed an issue on the same. This is breaking our internal build flows: our build farm does not have GPU-enabled machines, and the GPU builds are also done on CPU machines with CUDA installed on them for build purposes.

// Not currently performing a runtime check of linked-against vs. compiled-against
// cuda runtime library, as major.minor must match for libmxnet.so to even load, per:
// https://docs.nvidia.com/deploy/cuda-compatibility/#binary-compatibility
if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION)
LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version "
<< CUDA_VERSION << ", which is older than the oldest version tested by CI ("
<< MXNET_CI_OLDEST_CUDA_VERSION << "). "
<< "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning.";
}
return true;
}();

} // namespace cuda
} // namespace common
} // namespace mxnet

#endif // MXNET_USE_CUDA

#if MXNET_USE_CUDNN == 1

namespace mxnet {
namespace common {
namespace cudnn {

// The oldest version of CUDNN used in upstream MXNet CI testing, both for unix and windows.
// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
// their systems to match the CI level. Minimally, users should rerun the CI locally.
#if defined(_MSC_VER)
#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
#else
#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
#endif

// Dynamic init here will emit a warning if runtime and compile-time cudnn lib versions mismatch.
// Also if the user has recompiled their source to a version no longer tested by upstream CI.
bool cudnn_version_check_performed = []() {
// Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="")
if (dmlc::GetEnv("MXNET_CUDNN_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) {
size_t linkedAgainstCudnnVersion = cudnnGetVersion();
if (linkedAgainstCudnnVersion != CUDNN_VERSION)
LOG(WARNING) << "cuDNN library mismatch: linked-against version " << linkedAgainstCudnnVersion
<< " != compiled-against version " << CUDNN_VERSION << ". "
<< "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning.";
if (CUDNN_VERSION < MXNET_CI_OLDEST_CUDNN_VERSION)
LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuDNN library version "
<< CUDNN_VERSION << ", which is older than the oldest version tested by CI ("
<< MXNET_CI_OLDEST_CUDNN_VERSION << "). "
<< "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning.";
}
return true;
}();

} // namespace cudnn
} // namespace common
} // namespace mxnet

#endif // MXNET_USE_CUDNN
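
On the reviewer discussion above (GPU-built MXNet being loaded on CPU-only build hosts): a minimal sketch of one possible mitigation, assuming the failure comes from cudaGetDeviceCount() erroring when no driver is present. The helper name is hypothetical and this is not part of the PR:

```cpp
#include <cuda_runtime.h>

// Hypothetical helper (not in this diff): report zero GPUs rather than failing
// when the device count cannot be queried, e.g. no NVIDIA driver on a
// CPU-only build machine that merely has the CUDA toolkit installed.
static int SafeGetGPUCount() {
  int count = 0;
  cudaError_t err = cudaGetDeviceCount(&count);
  // cudaErrorNoDevice / cudaErrorInsufficientDriver are expected in that
  // environment; treat them (and any other error) as "no GPUs visible".
  return (err == cudaSuccess) ? count : 0;
}
```

With such a guard in the load-time check, the version-check lambdas above would simply skip their warnings on driverless hosts instead of aborting.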
27 changes: 27 additions & 0 deletions src/common/cuda_utils.h
@@ -47,12 +47,20 @@ extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif

#define QUOTE(x) #x
#define QUOTEVALUE(x) QUOTE(x)

#if MXNET_USE_CUDA

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>

#define STATIC_ASSERT_CUDA_VERSION_GE(min_version) \
static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \
QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
QUOTEVALUE(min_version) " or later.")

/*!
* \brief When compiling a __device__ function, check that the architecture is >= Kepler (3.0)
* Note that __CUDA_ARCH__ is not defined outside of a __device__ function
@@ -441,6 +449,25 @@ inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t n

#include <cudnn.h>

// Creating CUDNN_VERSION_AS_STRING as follows avoids a static_assert error message that shows
// the formula for CUDNN_VERSION, i.e. "1000 * 7 + 100 * 6 + 0" rather than number "7600".
static_assert(CUDNN_PATCHLEVEL < 100 && CUDNN_MINOR < 10,
"CUDNN_VERSION_AS_STRING macro assumptions violated.");
#if CUDNN_PATCHLEVEL >= 10
#define CUDNN_VERSION_AS_STRING QUOTEVALUE(CUDNN_MAJOR) \
QUOTEVALUE(CUDNN_MINOR) \
QUOTEVALUE(CUDNN_PATCHLEVEL)
#else
#define CUDNN_VERSION_AS_STRING QUOTEVALUE(CUDNN_MAJOR) \
QUOTEVALUE(CUDNN_MINOR) \
"0" QUOTEVALUE(CUDNN_PATCHLEVEL)
#endif

#define STATIC_ASSERT_CUDNN_VERSION_GE(min_version) \
static_assert(CUDNN_VERSION >= min_version, "Compiled-against cuDNN version " \
CUDNN_VERSION_AS_STRING " is too old, please upgrade system to version " \
QUOTEVALUE(min_version) " or later.")

#define CUDNN_CALL(func) \
{ \
cudnnStatus_t e = (func); \
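
For context, a minimal sketch of how the new compile-time guards could be used from GPU operator code. The include path, function name, and version numbers are assumptions for illustration; the only concrete use added by this diff is the STATIC_ASSERT_CUDNN_VERSION_GE(7000) call in src/resource.cc shown below:

```cpp
#include "../common/cuda_utils.h"  // assumed relative path from an operator file

#if MXNET_USE_CUDA
void SomeOpRequiringRecentToolkits() {
  // Fails compilation with a readable message, e.g.
  // "Compiled-against CUDA version 9000 is too old, please upgrade system
  //  to version 10000 or later."
  STATIC_ASSERT_CUDA_VERSION_GE(10000);
#if MXNET_USE_CUDNN == 1
  // CUDNN_VERSION_AS_STRING makes the message read "7600" for cuDNN 7.6.0
  // instead of the raw formula "1000 * 7 + 100 * 6 + 0".
  STATIC_ASSERT_CUDNN_VERSION_GE(7600);
#endif
}
#endif  // MXNET_USE_CUDA
```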
22 changes: 12 additions & 10 deletions src/resource.cc
@@ -92,9 +92,9 @@ class ResourceManagerImpl : public ResourceManager {
gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1);
cpu_native_rand_copy_ = dmlc::GetEnv("MXNET_CPU_PARALLEL_RAND_COPY", 1);
gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_PARALLEL_RAND_COPY", 4);
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
gpu_cudnn_dropout_state_copy_ = dmlc::GetEnv("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 4);
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1
engine_ref_ = Engine::_GetSharedRef();
storage_ref_ = Storage::_GetSharedRef();
cpu_rand_.reset(new ResourceRandom<cpu>(
@@ -113,9 +113,9 @@
gpu_rand_.Clear();
gpu_space_.Clear();
gpu_parallel_rand_.Clear();
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
gpu_cudnn_dropout_state_.Clear();
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1
#endif
if (engine_ref_ != nullptr) {
engine_ref_ = nullptr;
@@ -153,14 +153,14 @@ class ResourceManagerImpl : public ResourceManager {
return new ResourceParallelRandom<gpu>(ctx, gpu_native_rand_copy_, global_seed_);
})->GetNext();
}
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
case ResourceRequest::kCuDNNDropoutDesc: {
return gpu_cudnn_dropout_state_.Get(ctx.dev_id, [ctx, this]() {
return new ResourceTempSpace<ResourceRequest::kCuDNNDropoutDesc>(
ctx, gpu_cudnn_dropout_state_copy_);
})->GetNext();
}
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1
default: LOG(FATAL) << "Unknown supported type " << req.type;
}
#else
@@ -399,13 +399,13 @@ class ResourceManagerImpl : public ResourceManager {
common::LazyAllocArray<ResourceTempSpace<ResourceRequest::kTempSpace>> gpu_space_;
/*! \brief GPU parallel (on device) random number resources */
common::LazyAllocArray<ResourceParallelRandom<gpu> > gpu_parallel_rand_;
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
/*! \brief number of copies in GPU cudnn dropout descriptor resources */
int gpu_cudnn_dropout_state_copy_;
/*! \brief GPU parallel (on device) random number resources */
common::LazyAllocArray<ResourceTempSpace<ResourceRequest::kCuDNNDropoutDesc>>
gpu_cudnn_dropout_state_;
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1
#endif
};
} // namespace resource
@@ -418,7 +418,7 @@ void* Resource::get_host_space_internal(size_t size) const {
return static_cast<resource::SpaceAllocator*>(ptr_)->GetHostSpace(size);
}

#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#if MXNET_USE_CUDNN == 1
void Resource::get_cudnn_dropout_desc(
cudnnDropoutDescriptor_t* dropout_desc,
mshadow::Stream<gpu> *stream,
@@ -442,14 +442,16 @@ void Resource::get_cudnn_dropout_desc(
dropout_state_size,
seed));
} else {
// cudnnRestoreDropoutDescriptor() introduced with cuDNN v7
STATIC_ASSERT_CUDNN_VERSION_GE(7000);
CUDNN_CALL(cudnnRestoreDropoutDescriptor(*dropout_desc, stream->dnn_handle_,
dropout,
state_space->handle.dptr,
state_space->handle.size,
seed));
}
}
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif // MXNET_USE_CUDNN == 1

ResourceManager* ResourceManager::Get() {
typedef dmlc::ThreadLocalStore<resource::ResourceManagerImpl> inst;
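
To round out the resource changes, a hypothetical caller sketch for get_cudnn_dropout_desc(); the wrapper function below is invented, but the argument list matches the declaration in include/mxnet/resource.h from this diff. The first call on a given state space initializes the dropout states via cudnnSetDropoutDescriptor, and later calls take the cheaper cudnnRestoreDropoutDescriptor path (hence the cuDNN 7 static assert added above):

```cpp
#include <mxnet/base.h>
#include <mxnet/resource.h>

#if MXNET_USE_CUDNN == 1
#include <cudnn.h>

// Hypothetical wrapper: `res` is the resource obtained by requesting
// ResourceRequest::kCuDNNDropoutDesc; `desc` must already have been created
// with cudnnCreateDropoutDescriptor by the caller.
void FillDropoutDescriptor(const mxnet::Resource& res,
                           mshadow::Stream<mxnet::gpu>* stream,
                           cudnnDropoutDescriptor_t* desc,
                           float dropout_prob,
                           uint64_t seed) {
  res.get_cudnn_dropout_desc(desc, stream, dropout_prob, seed);
}
#endif  // MXNET_USE_CUDNN == 1
```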