From 7192fc89e9e5b4ccc61170f58aec7f8a72987dc8 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 2 Jul 2019 15:05:09 -0700 Subject: [PATCH 1/5] Add STATIC_ASSERT_{CUDA,CUDNN}_VERSION_GE macros. Protect rnn.cc against CUDNN_VERSION < 7000. --- docs/faq/env_var.md | 10 ++++ src/common/cuda_utils.cc | 114 +++++++++++++++++++++++++++++++++++++++ src/common/cuda_utils.h | 27 ++++++++++ src/operator/rnn.cc | 1 + 4 files changed, 152 insertions(+) create mode 100644 src/common/cuda_utils.cc diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index cdd528cd8c8f..a5ba07098005 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -242,6 +242,16 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. - If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores - If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`. +* MXNET_CUDA_VERSION_CHECKING + - 0(false) or 1(true) ```(default=1)``` + - If set to '0', disallows various runtime checks of the cuda library version and associated warning messages. + - If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested) + +* MXNET_CUDNN_VERSION_CHECKING + - 0(false) or 1(true) ```(default=1)``` + - If set to '0', disallows various runtime checks of the cuDNN library version and associated warning messages. + - If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested) + * MXNET_GLUON_REPO - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'``` - The repository url to be used for Gluon datasets and pre-trained models. 
diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc new file mode 100644 index 000000000000..344fd4e21279 --- /dev/null +++ b/src/common/cuda_utils.cc @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file cuda_utils.cc + * \brief CUDA debugging utilities. + */ + +#include +#include "cuda_utils.h" + +#if MXNET_USE_CUDA == 1 + +namespace mxnet { +namespace common { +namespace cuda { + +// The oldest version of cuda used in upstream MXNet CI testing, both for unix and windows. +// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade +// their systems to match the CI level. Minimally, users should rerun the CI locally. +#if defined(_MSC_VER) +#define MXNET_CI_OLDEST_CUDA_VERSION 9020 +#else +#define MXNET_CI_OLDEST_CUDA_VERSION 10000 +#endif + + +// Start-up check that the version of cuda compiled-against matches the linked-against version. 
+bool CudaVersionChecks() { + if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true)) { + int linkedAgainstCudaVersion = 0; + CUDA_CALL(cudaRuntimeGetVersion(&linkedAgainstCudaVersion)); + if (linkedAgainstCudaVersion != CUDA_VERSION) + LOG(WARNING) << "cuda library mismatch: linked-against version " << linkedAgainstCudaVersion + << " != compiled-against version " << CUDA_VERSION << "." + << "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning."; + if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION) + LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version " + << CUDA_VERSION << ", which is older than the oldest version tested by CI (" + << MXNET_CI_OLDEST_CUDA_VERSION << "). " + << "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning."; + } + return true; +} + +// Dynamic initialization here will emit a warning if runtime and compile-time versions mismatch. +// Also if the user has recompiled their source to a version no longer tested by upstream CI. +bool cuda_version_ok = CudaVersionChecks(); + +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#if MXNET_USE_CUDNN == 1 + +namespace mxnet { +namespace common { +namespace cudnn { + +// The oldest version of CUDNN used in upstream MXNet CI testing, both for unix and windows. +// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade +// their systems to match the CI level. Minimally, users should rerun the CI locally. +#if defined(_MSC_VER) +#define MXNET_CI_OLDEST_CUDNN_VERSION 7600 +#else +#define MXNET_CI_OLDEST_CUDNN_VERSION 7600 +#endif + +// Start-up check that the version of cudnn compiled-against matches the linked-against version. +// Also if the user has recompiled their source to a version no longer tested by upstream CI. 
+bool CuDNNVersionChecks() { + if (dmlc::GetEnv("MXNET_CUDNN_VERSION_CHECKING", true)) { + size_t linkedAgainstCudnnVersion = cudnnGetVersion(); + if (linkedAgainstCudnnVersion != CUDNN_VERSION) + LOG(WARNING) << "cuDNN library mismatch: linked-against version " << linkedAgainstCudnnVersion + << " != compiled-against version " << CUDNN_VERSION << ". " + << "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning."; + if (CUDNN_VERSION < MXNET_CI_OLDEST_CUDNN_VERSION) + LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuDNN library version " + << CUDNN_VERSION << ", which is older than the oldest version tested by CI (" + << MXNET_CI_OLDEST_CUDNN_VERSION << "). " + << "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning."; + } + return true; +} + +// Dynamic initialization here will emit a warning if runtime and compile-time versions mismatch. +// Also if the user has recompiled their source to a version no longer tested by upstream CI. +bool cudnn_version_ok = CuDNNVersionChecks(); + +} // namespace cudnn +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDNN diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h index 0dd9d2db3722..acc8d5fac6df 100644 --- a/src/common/cuda_utils.h +++ b/src/common/cuda_utils.h @@ -47,12 +47,20 @@ extern __cuda_fake_struct threadIdx; extern __cuda_fake_struct blockIdx; #endif +#define QUOTE(x) #x +#define QUOTEVALUE(x) QUOTE(x) + #if MXNET_USE_CUDA #include #include #include +#define STATIC_ASSERT_CUDA_VERSION_GE(min_version) \ + static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \ + QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \ + QUOTEVALUE(min_version) " or later.") + /*! 
* \brief When compiling a __device__ function, check that the architecture is >= Kepler (3.0) * Note that __CUDA_ARCH__ is not defined outside of a __device__ function @@ -441,6 +449,25 @@ inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t n #include +// Creating CUDNN_VERSION_AS_STRING as follows avoids a static_assert error message that shows +// the formula for CUDNN_VERSION, i.e. "1000 * 7 + 100 * 6 + 0" rather than number "7600". +static_assert(CUDNN_PATCHLEVEL < 100 && CUDNN_MINOR < 10, + "CUDNN_VERSION_AS_STRING macro assumptions violated."); +#if CUDNN_PATCHLEVEL >= 10 +#define CUDNN_VERSION_AS_STRING QUOTEVALUE(CUDNN_MAJOR) \ + QUOTEVALUE(CUDNN_MINOR) \ + QUOTEVALUE(CUDNN_PATCHLEVEL) +#else +#define CUDNN_VERSION_AS_STRING QUOTEVALUE(CUDNN_MAJOR) \ + QUOTEVALUE(CUDNN_MINOR) \ + "0" QUOTEVALUE(CUDNN_PATCHLEVEL) +#endif + +#define STATIC_ASSERT_CUDNN_VERSION_GE(min_version) \ + static_assert(CUDNN_VERSION >= min_version, "Compiled-against cuDNN version " \ + CUDNN_VERSION_AS_STRING " is too old, please upgrade system to version " \ + QUOTEVALUE(min_version) " or later.") + #define CUDNN_CALL(func) \ { \ cudnnStatus_t e = (func); \ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a0dbd7a4e23..b21241d9bacd 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -172,6 +172,7 @@ static std::vector RNNResourceEx(const NodeAttrs& attrs, const std::vector request; if (dev_mask == kGPU) { #if MXNET_USE_CUDNN_RNN + STATIC_ASSERT_CUDNN_VERSION_GE(7000); request.emplace_back(ResourceRequest::kTempSpace); const RNNParam& param = nnvm::get(attrs.parsed); From ea410f7490472f8f8b9209e6f3f187a775d6a944 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 3 Jul 2019 14:28:41 -0700 Subject: [PATCH 2/5] Omit cuda/cudnn lib version checks when no visible gpu devices. 
--- src/common/cuda_utils.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc index 344fd4e21279..4a18235b2d6f 100644 --- a/src/common/cuda_utils.cc +++ b/src/common/cuda_utils.cc @@ -44,7 +44,8 @@ namespace cuda { // Start-up check that the version of cuda compiled-against matches the linked-against version. bool CudaVersionChecks() { - if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true)) { + // Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="") + if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) { int linkedAgainstCudaVersion = 0; CUDA_CALL(cudaRuntimeGetVersion(&linkedAgainstCudaVersion)); if (linkedAgainstCudaVersion != CUDA_VERSION) @@ -88,7 +89,8 @@ namespace cudnn { // Start-up check that the version of cudnn compiled-against matches the linked-against version. // Also if the user has recompiled their source to a version no longer tested by upstream CI. bool CuDNNVersionChecks() { - if (dmlc::GetEnv("MXNET_CUDNN_VERSION_CHECKING", true)) { + // Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="") + if (dmlc::GetEnv("MXNET_CUDNN_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) { size_t linkedAgainstCudnnVersion = cudnnGetVersion(); if (linkedAgainstCudnnVersion != CUDNN_VERSION) LOG(WARNING) << "cuDNN library mismatch: linked-against version " << linkedAgainstCudnnVersion From ab5112fcc1974bdc3718717bfc6f2f1480511a5e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 3 Jul 2019 18:37:23 -0700 Subject: [PATCH 3/5] Move STATIC_ASSERT_... to resource.cc. 
--- include/mxnet/resource.h | 8 ++++---- src/operator/rnn.cc | 1 - src/resource.cc | 22 ++++++++++++---------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index 34c8f88d1ca9..f8ee6364807c 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -44,11 +44,11 @@ struct ResourceRequest { kTempSpace, /*! \brief common::RandGenerator object, which can be used in GPU kernel functions */ kParallelRandom -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 , /*! \brief cudnnDropoutDescriptor_t object for GPU dropout kernel functions */ kCuDNNDropoutDesc -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 }; /*! \brief type of resources */ Type type; @@ -162,7 +162,7 @@ struct Resource { reinterpret_cast(get_space_internal(shape.Size() * sizeof(DType))), shape, shape[ndim - 1], stream); } -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 /*! * \brief Get cudnn dropout descriptor from shared state space. * @@ -175,7 +175,7 @@ struct Resource { mshadow::Stream *stream, const float dropout, uint64_t seed) const; -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 /*! * \brief Get CPU space as mshadow Tensor in specified type. 
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index b21241d9bacd..6a0dbd7a4e23 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -172,7 +172,6 @@ static std::vector RNNResourceEx(const NodeAttrs& attrs, const std::vector request; if (dev_mask == kGPU) { #if MXNET_USE_CUDNN_RNN - STATIC_ASSERT_CUDNN_VERSION_GE(7000); request.emplace_back(ResourceRequest::kTempSpace); const RNNParam& param = nnvm::get(attrs.parsed); diff --git a/src/resource.cc b/src/resource.cc index cd6320d393b1..3f461243e499 100644 --- a/src/resource.cc +++ b/src/resource.cc @@ -92,9 +92,9 @@ class ResourceManagerImpl : public ResourceManager { gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1); cpu_native_rand_copy_ = dmlc::GetEnv("MXNET_CPU_PARALLEL_RAND_COPY", 1); gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_PARALLEL_RAND_COPY", 4); -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 gpu_cudnn_dropout_state_copy_ = dmlc::GetEnv("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 4); -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 engine_ref_ = Engine::_GetSharedRef(); storage_ref_ = Storage::_GetSharedRef(); cpu_rand_.reset(new ResourceRandom( @@ -113,9 +113,9 @@ class ResourceManagerImpl : public ResourceManager { gpu_rand_.Clear(); gpu_space_.Clear(); gpu_parallel_rand_.Clear(); -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 gpu_cudnn_dropout_state_.Clear(); -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 #endif if (engine_ref_ != nullptr) { engine_ref_ = nullptr; @@ -153,14 +153,14 @@ class ResourceManagerImpl : public ResourceManager { return new ResourceParallelRandom(ctx, gpu_native_rand_copy_, global_seed_); })->GetNext(); } -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 case ResourceRequest::kCuDNNDropoutDesc: { return gpu_cudnn_dropout_state_.Get(ctx.dev_id, [ctx, this]() { return new ResourceTempSpace( ctx, 
gpu_cudnn_dropout_state_copy_); })->GetNext(); } -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 default: LOG(FATAL) << "Unknown supported type " << req.type; } #else @@ -399,13 +399,13 @@ class ResourceManagerImpl : public ResourceManager { common::LazyAllocArray> gpu_space_; /*! \brief GPU parallel (on device) random number resources */ common::LazyAllocArray > gpu_parallel_rand_; -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 /*! \brief number of copies in GPU cudnn dropout descriptor resources */ int gpu_cudnn_dropout_state_copy_; /*! \brief GPU parallel (on device) random number resources */ common::LazyAllocArray> gpu_cudnn_dropout_state_; -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 #endif }; } // namespace resource @@ -418,7 +418,7 @@ void* Resource::get_host_space_internal(size_t size) const { return static_cast(ptr_)->GetHostSpace(size); } -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#if MXNET_USE_CUDNN == 1 void Resource::get_cudnn_dropout_desc( cudnnDropoutDescriptor_t* dropout_desc, mshadow::Stream *stream, @@ -442,6 +442,8 @@ void Resource::get_cudnn_dropout_desc( dropout_state_size, seed)); } else { + // cudnnRestoreDropoutDescriptor() introduced with cuDNN v7 + STATIC_ASSERT_CUDNN_VERSION_GE(7000); CUDNN_CALL(cudnnRestoreDropoutDescriptor(*dropout_desc, stream->dnn_handle_, dropout, state_space->handle.dptr, @@ -449,7 +451,7 @@ void Resource::get_cudnn_dropout_desc( seed)); } } -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#endif // MXNET_USE_CUDNN == 1 ResourceManager* ResourceManager::Get() { typedef dmlc::ThreadLocalStore inst; From 59cd7e07501d60e4e442750f1599009c4d30d2a1 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 3 Jul 2019 19:07:32 -0700 Subject: [PATCH 4/5] Remove function names in cuda/cudnn version check impl. 
--- src/common/cuda_utils.cc | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc index 4a18235b2d6f..4201a41935ac 100644 --- a/src/common/cuda_utils.cc +++ b/src/common/cuda_utils.cc @@ -41,9 +41,9 @@ namespace cuda { #define MXNET_CI_OLDEST_CUDA_VERSION 10000 #endif - -// Start-up check that the version of cuda compiled-against matches the linked-against version. -bool CudaVersionChecks() { +// Dynamic init here will emit a warning if runtime and compile-time cuda lib versions mismatch. +// Also if the user has recompiled their source to a version no longer tested by upstream CI. +bool cuda_version_check_performed = []() { // Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="") if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) { int linkedAgainstCudaVersion = 0; @@ -59,11 +59,7 @@ bool CudaVersionChecks() { << "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning."; } return true; -} - -// Dynamic initialization here will emit a warning if runtime and compile-time versions mismatch. -// Also if the user has recompiled their source to a version no longer tested by upstream CI. -bool cuda_version_ok = CudaVersionChecks(); +}(); } // namespace cuda } // namespace common @@ -86,9 +82,9 @@ namespace cudnn { #define MXNET_CI_OLDEST_CUDNN_VERSION 7600 #endif -// Start-up check that the version of cudnn compiled-against matches the linked-against version. +// Dynamic init here will emit a warning if runtime and compile-time cudnn lib versions mismatch. // Also if the user has recompiled their source to a version no longer tested by upstream CI. -bool CuDNNVersionChecks() { +bool cudnn_version_check_performed = []() { // Don't bother with checks if there are no GPUs visible (e.g. 
with CUDA_VISIBLE_DEVICES="") if (dmlc::GetEnv("MXNET_CUDNN_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) { size_t linkedAgainstCudnnVersion = cudnnGetVersion(); @@ -103,11 +99,7 @@ bool CuDNNVersionChecks() { << "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning."; } return true; -} - -// Dynamic initialization here will emit a warning if runtime and compile-time versions mismatch. -// Also if the user has recompiled their source to a version no longer tested by upstream CI. -bool cudnn_version_ok = CuDNNVersionChecks(); +}(); } // namespace cudnn } // namespace common From 9ec6097965208b2f04e61b17036f7fee995401af Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 11 Jul 2019 15:13:07 -0700 Subject: [PATCH 5/5] Remove runtime cuda lib check- major.minor already needed for program load. --- src/common/cuda_utils.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc index 4201a41935ac..728d1e6681e6 100644 --- a/src/common/cuda_utils.cc +++ b/src/common/cuda_utils.cc @@ -46,12 +46,9 @@ namespace cuda { bool cuda_version_check_performed = []() { // Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="") if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) { - int linkedAgainstCudaVersion = 0; - CUDA_CALL(cudaRuntimeGetVersion(&linkedAgainstCudaVersion)); - if (linkedAgainstCudaVersion != CUDA_VERSION) - LOG(WARNING) << "cuda library mismatch: linked-against version " << linkedAgainstCudaVersion - << " != compiled-against version " << CUDA_VERSION << "." - << "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning."; + // Not currently performing a runtime check of linked-against vs. 
compiled-against + // cuda runtime library, as major.minor must match for libmxnet.so to even load, per: + // https://docs.nvidia.com/deploy/cuda-compatibility/#binary-compatibility if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION) LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version " << CUDA_VERSION << ", which is older than the oldest version tested by CI ("