Bypass cuda/cudnn checks if no driver. (#15551)

* Bypass cuda/cudnn checks if no driver. * Perform cuda/cudnn checks only when a gpu context is created. * Fix cpplint.
apache · Jul 17, 2019 · cb0697f · cb0697f
1 parent 57d097b
commit cb0697f
Show file tree

Hide file tree

Showing 4 changed files with 132 additions and 112 deletions.
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
@@ -242,12 +242,12 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
 	- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
 	- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.
 
-* MXNET_CUDA_VERSION_CHECKING
+* MXNET_CUDA_LIB_CHECKING
   - 0(false) or 1(true) ```(default=1)```
   - If set to '0', disallows various runtime checks of the cuda library version and associated warning messages.
   - If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)
 
-* MXNET_CUDNN_VERSION_CHECKING
+* MXNET_CUDNN_LIB_CHECKING
   - 0(false) or 1(true) ```(default=1)```
   - If set to '0', disallows various runtime checks of the cuDNN library version and associated warning messages.
   - If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)

diff --git a/include/mxnet/base.h b/include/mxnet/base.h
@@ -191,6 +191,11 @@ struct Context {
    * \return The number of GPUs that are available.
    */
   inline static int32_t GetGPUCount();
+  /*!
+   * Check whether the cuda driver is installed and visible to the system.
+   * \return Whether the driver is present.
+   */
+  inline static bool GPUDriverPresent();
   /*!
    * Get the number of streams that a GPU Worker has available to operations.
    * \return The number of streams that are available.
@@ -222,6 +227,14 @@ struct Context {
    * \return Context
    */
   inline static Context FromString(const std::string& str);
+
+ private:
+#if MXNET_USE_CUDA
+    static void CudaLibChecks();
+#endif
+#if MXNET_USE_CUDNN
+    static void CuDNNLibChecks();
+#endif
 };
 
 #if MXNET_USE_CUDA
@@ -387,17 +400,21 @@ inline bool Context::operator<(const Context &b) const {
 inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
   Context ctx;
   ctx.dev_type = dev_type;
-  if (dev_id < 0) {
-    ctx.dev_id = 0;
-    if (dev_type & kGPU) {
+  ctx.dev_id = dev_id < 0 ? 0 : dev_id;  // negative id maps to 0; GPU case is refined below
+  if (dev_type & kGPU) {
+#if MXNET_USE_CUDA
+    CudaLibChecks();  // cuda driver/library warnings, run on GPU context creation
+#endif
+#if MXNET_USE_CUDNN
+    CuDNNLibChecks();  // cuDNN version warnings, run on GPU context creation
+#endif
+    if (dev_id < 0) {  // no explicit device requested: ask cuda for the current device
 #if MXNET_USE_CUDA
       CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
 #else
       LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
 #endif
     }
-  } else {
-    ctx.dev_id = dev_id;
   }
   return ctx;
 }
@@ -417,8 +434,21 @@ inline Context Context::GPU(int32_t dev_id) {
   return Create(kGPU, dev_id);
 }
 
+inline bool Context::GPUDriverPresent() {
+#if MXNET_USE_CUDA
+  int cuda_driver_version = 0;  // presumably left at 0 when no driver is installed - confirm
+  CHECK_EQ(cudaDriverGetVersion(&cuda_driver_version), cudaSuccess);  // query must succeed
+  return cuda_driver_version > 0;  // a positive version is treated as "driver present"
+#else
+  return false;  // built without cuda support
+#endif
+}
+
 inline int32_t Context::GetGPUCount() {
 #if MXNET_USE_CUDA
+  if (!GPUDriverPresent()) {
+    return 0;
+  }
   int32_t count;
   cudaError_t e = cudaGetDeviceCount(&count);
   if (e == cudaErrorNoDevice) {

diff --git a/src/base.cc b/src/base.cc
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file base.cc
+ * \brief Implementation of base declarations, e.g. context
+ */
+#include <mxnet/base.h>
+
+namespace mxnet {
+
+#define UNUSED(x) (void)(x)
+
+#if MXNET_USE_CUDA == 1
+// The oldest version of cuda used in upstream MXNet CI testing, both for unix and windows.
+// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
+// their systems to match the CI level.  Minimally, users should rerun the CI locally.
+#if defined(_MSC_VER)
+#define MXNET_CI_OLDEST_CUDA_VERSION  9020
+#else
+#define MXNET_CI_OLDEST_CUDA_VERSION 10000
+#endif
+
+void Context::CudaLibChecks() {
+  // The checks live in the initializer of a function-local static, so they run only once.
+  // At most one warning is emitted: no driver, no GPUs, or a cuda build older than CI tests.
+  static bool cuda_lib_checks_performed = []() {
+    if (dmlc::GetEnv("MXNET_CUDA_LIB_CHECKING", true)) {  // checks default to enabled
+      if (!GPUDriverPresent())
+        LOG(WARNING) << "Please install cuda driver for GPU use.  No cuda driver detected.";
+      else if (GetGPUCount() == 0)
+        LOG(WARNING) << "GPU context requested, but no GPUs found.";
+      else if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION)  // compile-time cuda version
+        LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version "
+                     << CUDA_VERSION << ", which is older than the oldest version tested by CI ("
+                     << MXNET_CI_OLDEST_CUDA_VERSION << ").  "
+                     << "Set MXNET_CUDA_LIB_CHECKING=0 to quiet this warning.";
+    }
+    return true;  // the value is never read; it only gives the static something to hold
+  }();
+  UNUSED(cuda_lib_checks_performed);  // silences the unused-variable warning
+}
+#endif  // MXNET_USE_CUDA
+
+#if MXNET_USE_CUDNN == 1
+// The oldest version of CUDNN used in upstream MXNet CI testing, both for unix and windows.
+// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
+// their systems to match the CI level.  Minimally, users should rerun the CI locally.
+#if defined(_MSC_VER)
+#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
+#else
+#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
+#endif
+
+void Context::CuDNNLibChecks() {
+  // The checks live in the initializer of a function-local static, so they run only once.
+  // Two independent warnings: linked vs. compiled cuDNN mismatch, and a build older than CI tests.
+  static bool cudnn_lib_checks_performed = []() {
+    // Skip the checks when disabled or no GPUs are visible (e.g. with CUDA_VISIBLE_DEVICES="")
+    if (dmlc::GetEnv("MXNET_CUDNN_LIB_CHECKING", true) && GetGPUCount() > 0) {
+      size_t linkedAgainstCudnnVersion = cudnnGetVersion();  // version reported at runtime
+      if (linkedAgainstCudnnVersion != CUDNN_VERSION)  // CUDNN_VERSION is the compile-time value
+        LOG(WARNING) << "cuDNN lib mismatch: linked-against version " << linkedAgainstCudnnVersion
+                     << " != compiled-against version " << CUDNN_VERSION << ".  "
+                     << "Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning.";
+      if (CUDNN_VERSION < MXNET_CI_OLDEST_CUDNN_VERSION)  // not an else: both warnings may fire
+        LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuDNN lib version "
+                     <<  CUDNN_VERSION << ", which is older than the oldest version tested by CI ("
+                     << MXNET_CI_OLDEST_CUDNN_VERSION << ").  "
+                     << "Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning.";
+    }
+    return true;  // the value is never read; it only gives the static something to hold
+  }();
+  UNUSED(cudnn_lib_checks_performed);  // silences the unused-variable warning
+}
+#endif  // MXNET_USE_CUDNN
+
+}  // namespace mxnet
diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc