Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Bypass cuda/cudnn checks if no driver. (#15551)
Browse files Browse the repository at this point in the history
* Bypass cuda/cudnn checks if no driver.

* Perform cuda/cudnn checks only when a gpu context is created.

* Fix cpplint.
  • Loading branch information
DickJC123 authored and ptrendx committed Jul 17, 2019
1 parent 57d097b commit cb0697f
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 112 deletions.
4 changes: 2 additions & 2 deletions docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,12 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.

* MXNET_CUDA_VERSION_CHECKING
* MXNET_CUDA_LIB_CHECKING
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows various runtime checks of the cuda library version and associated warning messages.
- If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)

* MXNET_CUDNN_VERSION_CHECKING
* MXNET_CUDNN_LIB_CHECKING
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows various runtime checks of the cuDNN library version and associated warning messages.
- If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)
Expand Down
40 changes: 35 additions & 5 deletions include/mxnet/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ struct Context {
* \return The number of GPUs that are available.
*/
inline static int32_t GetGPUCount();
/*!
* \brief Report whether the cuda driver is installed and visible to the system.
* In builds without MXNET_USE_CUDA this always reports false. In cuda builds the
* answer is based on the driver version query, and a failure of that query itself
* trips a CHECK rather than being reported as "not present".
* \return Whether the driver is present.
*/
inline static bool GPUDriverPresent();
/*!
* Get the number of streams that a GPU Worker has available to operations.
* \return The number of streams that are available.
Expand Down Expand Up @@ -222,6 +227,14 @@ struct Context {
* \return Context
*/
inline static Context FromString(const std::string& str);

private:
#if MXNET_USE_CUDA
/*!
* \brief Run the cuda library checks (driver present, GPUs visible, build version
* vs. oldest CI-tested version) and log a warning for the first problem found.
* Called from Create() when a GPU-type context is requested; controlled by the
* MXNET_CUDA_LIB_CHECKING environment variable.
*/
static void CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
/*!
* \brief Run the cuDNN library checks (linked vs. compiled version mismatch, build
* version vs. oldest CI-tested version) and log warnings as needed.
* Called from Create() when a GPU-type context is requested; controlled by the
* MXNET_CUDNN_LIB_CHECKING environment variable.
*/
static void CuDNNLibChecks();
#endif
};

#if MXNET_USE_CUDA
Expand Down Expand Up @@ -387,17 +400,21 @@ inline bool Context::operator<(const Context &b) const {
/*!
 * \brief Build a Context for the given device type and device id.
 *
 * NOTE(review): this span is a rendered diff hunk shown without +/- markers. It
 * appears to interleave the removed and the added lines of the function (the braces
 * do not balance as shown), so read it as a before/after listing rather than as
 * compilable code.
 *
 * \param dev_type The device type of the context.
 * \param dev_id The device id; a negative value asks for a default to be chosen.
 * \return The populated Context.
 */
inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
Context ctx;
ctx.dev_type = dev_type;
if (dev_id < 0) {
ctx.dev_id = 0;
if (dev_type & kGPU) {
// A negative dev_id falls back to 0 here; any other value is taken as given.
ctx.dev_id = dev_id < 0 ? 0 : dev_id;
if (dev_type & kGPU) {
// Library checks are issued only on this path, i.e. when the device type has the
// kGPU bit set, and only for the libraries this build was compiled with.
#if MXNET_USE_CUDA
CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
CuDNNLibChecks();
#endif
if (dev_id < 0) {
#if MXNET_USE_CUDA
// No explicit id was given: use whatever device cudaGetDevice reports.
CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
#else
// A GPU-type context without a device id cannot be resolved in a non-cuda build.
LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
#endif
}
} else {
ctx.dev_id = dev_id;
}
return ctx;
}
Expand All @@ -417,8 +434,21 @@ inline Context Context::GPU(int32_t dev_id) {
return Create(kGPU, dev_id);
}

/*!
 * \brief Report whether a cuda driver is visible to this process.
 * \return false in builds without MXNET_USE_CUDA; otherwise true exactly when the
 *         driver version obtained from cudaDriverGetVersion is positive. A failure
 *         of the query itself trips the CHECK instead of returning false.
 */
inline bool Context::GPUDriverPresent() {
#if !MXNET_USE_CUDA
  // Built without cuda support: there is never a usable driver.
  return false;
#else
  // Start from 0 so that an untouched value reads as "no driver".
  int driver_version = 0;
  const cudaError_t query_status = cudaDriverGetVersion(&driver_version);
  CHECK_EQ(query_status, cudaSuccess);
  return driver_version > 0;
#endif
}

inline int32_t Context::GetGPUCount() {
#if MXNET_USE_CUDA
if (!GPUDriverPresent()) {
return 0;
}
int32_t count;
cudaError_t e = cudaGetDeviceCount(&count);
if (e == cudaErrorNoDevice) {
Expand Down
95 changes: 95 additions & 0 deletions src/base.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file base.cc
* \brief Implementation of base declarations, e.g. context
*/
#include <mxnet/base.h>

namespace mxnet {

#define UNUSED(x) (void)(x)

#if MXNET_USE_CUDA == 1
// The oldest version of cuda used in upstream MXNet CI testing, both for unix and windows.
// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
// their systems to match the CI level. Minimally, users should rerun the CI locally.
#if defined(_MSC_VER)
#define MXNET_CI_OLDEST_CUDA_VERSION 9020
#else
#define MXNET_CI_OLDEST_CUDA_VERSION 10000
#endif

/*!
 * \brief Log at most one advisory warning about the cuda environment.
 *
 * The checks are tried in order and stop at the first one that fires: missing cuda
 * driver, no GPUs found, then a build against a cuda version older than the oldest
 * one tested by CI. Setting MXNET_CUDA_LIB_CHECKING=0 disables all of them.
 */
void Context::CudaLibChecks() {
  // The lambda is the initializer of a function-local static, so it is evaluated when
  // the static is first initialized rather than on every call.
  static bool cuda_checks_done = []() {
    const bool checking_enabled = dmlc::GetEnv("MXNET_CUDA_LIB_CHECKING", true);
    if (!checking_enabled) {
      return true;
    }
    if (!GPUDriverPresent()) {
      LOG(WARNING) << "Please install cuda driver for GPU use.  No cuda driver detected.";
      return true;
    }
    if (GetGPUCount() == 0) {
      LOG(WARNING) << "GPU context requested, but no GPUs found.";
      return true;
    }
    // Compares two compile-time constants: the cuda version this binary was built
    // against and the CI floor defined above.
    if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION) {
      LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version "
                   << CUDA_VERSION << ", which is older than the oldest version tested by CI ("
                   << MXNET_CI_OLDEST_CUDA_VERSION << ").  "
                   << "Set MXNET_CUDA_LIB_CHECKING=0 to quiet this warning.";
    }
    return true;
  }();
  // The static exists only to trigger the initializer; silence unused-variable warnings.
  UNUSED(cuda_checks_done);
}
#endif // MXNET_USE_CUDA

#if MXNET_USE_CUDNN == 1
// The oldest version of CUDNN used in upstream MXNet CI testing, both for unix and windows.
// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
// their systems to match the CI level. Minimally, users should rerun the CI locally.
#if defined(_MSC_VER)
#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
#else
#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
#endif

void Context::CuDNNLibChecks() {
// One-time init here will emit a warning if runtime and compile-time cudnn lib versions mismatch.
// Also if the user has recompiled their source to a version no longer tested by upstream CI.
static bool cudnn_lib_checks_performed = []() {
// Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="")
if (dmlc::GetEnv("MXNET_CUDNN_LIB_CHECKING", true) && GetGPUCount() > 0) {
size_t linkedAgainstCudnnVersion = cudnnGetVersion();
if (linkedAgainstCudnnVersion != CUDNN_VERSION)
LOG(WARNING) << "cuDNN lib mismatch: linked-against version " << linkedAgainstCudnnVersion
<< " != compiled-against version " << CUDNN_VERSION << ". "
<< "Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning.";
if (CUDNN_VERSION < MXNET_CI_OLDEST_CUDNN_VERSION)
LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuDNN lib version "
<< CUDNN_VERSION << ", which is older than the oldest version tested by CI ("
<< MXNET_CI_OLDEST_CUDNN_VERSION << "). "
<< "Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning.";
}
return true;
}();
UNUSED(cudnn_lib_checks_performed);
}
#endif // MXNET_USE_CUDNN

} // namespace mxnet
105 changes: 0 additions & 105 deletions src/common/cuda_utils.cc

This file was deleted.

0 comments on commit cb0697f

Please sign in to comment.