Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Bypass cuda/cudnn checks if no driver. #15551

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,12 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.

* MXNET_CUDA_VERSION_CHECKING
* MXNET_CUDA_LIB_CHECKING
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows various runtime checks of the cuda library version and associated warning messages.
- If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)

* MXNET_CUDNN_VERSION_CHECKING
* MXNET_CUDNN_LIB_CHECKING
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows various runtime checks of the cuDNN library version and associated warning messages.
- If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)
Expand Down
40 changes: 35 additions & 5 deletions include/mxnet/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ struct Context {
* \return The number of GPUs that are available.
*/
inline static int32_t GetGPUCount();
/*!
* Check whether the CUDA driver is installed and visible to the system.
* \return Whether the driver is present.
*/
inline static bool GPUDriverPresent();
/*!
* Get the number of streams that a GPU Worker has available to operations.
* \return The number of streams that are available.
Expand Down Expand Up @@ -222,6 +227,14 @@ struct Context {
* \return Context
*/
inline static Context FromString(const std::string& str);

private:
#if MXNET_USE_CUDA
static void CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
static void CuDNNLibChecks();
#endif
};

#if MXNET_USE_CUDA
Expand Down Expand Up @@ -387,17 +400,21 @@ inline bool Context::operator<(const Context &b) const {
inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
Context ctx;
ctx.dev_type = dev_type;
if (dev_id < 0) {
ctx.dev_id = 0;
if (dev_type & kGPU) {
ctx.dev_id = dev_id < 0 ? 0 : dev_id;
if (dev_type & kGPU) {
#if MXNET_USE_CUDA
CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
CuDNNLibChecks();
#endif
if (dev_id < 0) {
#if MXNET_USE_CUDA
CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
#else
LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
#endif
}
} else {
ctx.dev_id = dev_id;
}
return ctx;
}
Expand All @@ -417,8 +434,21 @@ inline Context Context::GPU(int32_t dev_id) {
return Create(kGPU, dev_id);
}

inline bool Context::GPUDriverPresent() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😀

#if MXNET_USE_CUDA
int cuda_driver_version = 0;
CHECK_EQ(cudaDriverGetVersion(&cuda_driver_version), cudaSuccess);
return cuda_driver_version > 0;
#else
return false;
#endif
}

inline int32_t Context::GetGPUCount() {
#if MXNET_USE_CUDA
if (!GPUDriverPresent()) {
return 0;
}
int32_t count;
cudaError_t e = cudaGetDeviceCount(&count);
if (e == cudaErrorNoDevice) {
Expand Down
95 changes: 95 additions & 0 deletions src/base.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file base.cc
* \brief Implementation of base declarations, e.g. context
*/
#include <mxnet/base.h>

namespace mxnet {

// Evaluates x and discards the result. Used below on the one-time-init flags, which are
// assigned once by their initializer and otherwise never read.
#define UNUSED(x) (void)(x)

#if MXNET_USE_CUDA == 1
// The oldest version of cuda used in upstream MXNet CI testing, both for unix and windows.
// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
// their systems to match the CI level. Minimally, users should rerun the CI locally.
#if defined(_MSC_VER)
#define MXNET_CI_OLDEST_CUDA_VERSION 9020
#else
#define MXNET_CI_OLDEST_CUDA_VERSION 10000
#endif

void Context::CudaLibChecks() {
  // The checks live in the initializer of a function-local static, so they run only the
  // first time this function is called. At most one warning is emitted, in priority order:
  //   1. no cuda driver detected,
  //   2. driver present but no GPUs found,
  //   3. built against a cuda version older than the oldest one tested by upstream CI.
  // Setting MXNET_CUDA_LIB_CHECKING=0 suppresses all of them.
  static const bool checks_done = [] {
    const bool checking_enabled = dmlc::GetEnv("MXNET_CUDA_LIB_CHECKING", true);
    if (!checking_enabled) {
      return true;
    }
    if (!GPUDriverPresent()) {
      LOG(WARNING) << "Please install cuda driver for GPU use. No cuda driver detected.";
      return true;
    }
    if (GetGPUCount() == 0) {
      LOG(WARNING) << "GPU context requested, but no GPUs found.";
      return true;
    }
    if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION) {
      LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version "
                   << CUDA_VERSION << ", which is older than the oldest version tested by CI ("
                   << MXNET_CI_OLDEST_CUDA_VERSION << "). "
                   << "Set MXNET_CUDA_LIB_CHECKING=0 to quiet this warning.";
    }
    return true;
  }();
  UNUSED(checks_done);
}
#endif // MXNET_USE_CUDA

#if MXNET_USE_CUDNN == 1
// The oldest version of CUDNN used in upstream MXNet CI testing, both for unix and windows.
// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
// their systems to match the CI level. Minimally, users should rerun the CI locally.
#if defined(_MSC_VER)
#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
#else
#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
#endif

void Context::CuDNNLibChecks() {
  // The checks live in the initializer of a function-local static, so they run only the
  // first time this function is called. Two independent warnings may be emitted:
  //   - the cuDNN version reported at runtime differs from the compile-time CUDNN_VERSION,
  //   - the compile-time CUDNN_VERSION is older than the oldest one tested by upstream CI.
  // Setting MXNET_CUDNN_LIB_CHECKING=0 suppresses both.
  static const bool checks_done = [] {
    if (!dmlc::GetEnv("MXNET_CUDNN_LIB_CHECKING", true)) {
      return true;
    }
    // Skip the checks entirely when no GPUs are visible (e.g. with CUDA_VISIBLE_DEVICES="").
    if (GetGPUCount() <= 0) {
      return true;
    }
    const size_t runtime_cudnn_version = cudnnGetVersion();
    if (runtime_cudnn_version != CUDNN_VERSION) {
      LOG(WARNING) << "cuDNN lib mismatch: linked-against version " << runtime_cudnn_version
                   << " != compiled-against version " << CUDNN_VERSION << ". "
                   << "Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning.";
    }
    if (CUDNN_VERSION < MXNET_CI_OLDEST_CUDNN_VERSION) {
      LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuDNN lib version "
                   << CUDNN_VERSION << ", which is older than the oldest version tested by CI ("
                   << MXNET_CI_OLDEST_CUDNN_VERSION << "). "
                   << "Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning.";
    }
    return true;
  }();
  UNUSED(checks_done);
}
#endif // MXNET_USE_CUDNN

} // namespace mxnet
105 changes: 0 additions & 105 deletions src/common/cuda_utils.cc

This file was deleted.