CudnnFind() usage improvements (apache#12804)
* Add mx.context.gpu_memory_info() to the Python API for flexible tests.

* Add test_gluon_gpu.py:test_large_models to show cudnnFind headroom issue.

* Output model sizes tried by test_gluon_gpu.py:test_large_models.

* Fix Perl interface to MXGetGPUMemoryInformation.

* Increase difficulty of test_gluon_gpu.py:test_large_models.

* Add a file forgotten in the Perl fix.

* Modify test to pass on no-cudnn CI runner.

* Mutex-protect algo registry updates; serialize cudnnFind calls.

* Fix for cudnnFind memory headroom issue.

* Fix cpplint.

* Respond to reviewers' comments.

* Guard against improper MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE values.

* Fix potentially unassigned var.
DickJC123 authored and ChaiBapchya committed Oct 30, 2018
1 parent 0dc4646 commit 72eecdd
Showing 12 changed files with 707 additions and 491 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
@@ -187,3 +187,4 @@ List of Contributors
* [LuckyPigeon](https://github.com/LuckyPigeon)
* [Anton Chernov](https://github.com/lebeg)
* [Denisa Roberts](https://github.com/D-Roberts)
* [Dick Carter](https://github.com/DickJC123)
4 changes: 4 additions & 0 deletions docs/faq/env_var.md
@@ -67,6 +67,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
* MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
- Values: Int ```(default=24)```
- The cutoff threshold that decides the rounding strategy. Let the threshold be T. If the requested memory size is smaller than `2 ** T` (by default, `2 ** 24` = 16MB), it is rounded up to the smallest power of two that covers the request; if it is larger than `2 ** T`, it is rounded up to the next multiple of `2 ** T`.
* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
- Values: Int ```(default=2097152)```
- When using the naive pool type, memory allocations larger than this threshold are rounded up to a multiple of this value.
- The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable this rounding. (Both rounding rules are sketched below.)
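For illustration, here is a minimal sketch of the two rounding rules described above. This is not MXNet's internal code; the function names and the assumption that the pool sees raw request sizes in bytes are ours:

```python
def round_pool2d(request, cutoff=24):
    """Pool rounding around the 2**cutoff threshold (default 16MB)."""
    threshold = 1 << cutoff
    if request < threshold:
        # Below the cutoff: round up to the smallest power of two covering the request.
        n = 1
        while n < request:
            n <<= 1
        return n
    # Above the cutoff: round up to the next multiple of 2**cutoff.
    return ((request + threshold - 1) // threshold) * threshold

def round_large_alloc(request, round_size=2097152):
    """MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE-style rounding for the naive pool."""
    if round_size <= 1 or request < round_size:
        return request
    return ((request + round_size - 1) // round_size) * round_size

assert round_pool2d(10 * 1024**2) == 16 * 1024**2       # below cutoff: power of 2
assert round_pool2d(20 * 1024**2) == 32 * 1024**2       # above cutoff: k * 2**24
assert round_large_alloc(3 * 1024**2) == 4 * 1024**2    # multiple of 2MB
```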

## Engine Type

14 changes: 7 additions & 7 deletions include/mxnet/base.h
@@ -225,11 +225,11 @@ struct Context {
/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return No return value
*/
inline static void GetGPUMemoryInformation(int dev, int *free, int *total);
inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
/*!
* Create a pinned CPU context.
* \param dev_id the device id for corresponding GPU.
@@ -334,8 +334,8 @@ inline int32_t Context::GetGPUCount() {
#endif
}

inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
int *total_mem) {
inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
uint64_t *total_mem) {
#if MXNET_USE_CUDA

size_t memF, memT;
@@ -354,8 +354,8 @@ inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
e = cudaSetDevice(curDevice);
CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

*free_mem = static_cast<int>(memF);
*total_mem = static_cast<int>(memT);
*free_mem = static_cast<uint64_t>(memF);
*total_mem = static_cast<uint64_t>(memT);

#else
LOG(FATAL)
10 changes: 10 additions & 0 deletions include/mxnet/c_api.h
@@ -441,13 +441,23 @@ MXNET_DLL int MXGetGPUCount(int* out);

/*!
* \brief get the free and total available memory on a GPU
* Note: Deprecated, use MXGetGPUMemoryInformation64 instead.
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem);

/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem);
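As a usage illustration, the new 64-bit entry point can be called directly through ctypes. This is a hedged sketch, not part of the change: it assumes `libmxnet.so` is already on the loader path, and it mirrors what the Python binding later in this diff does via `_LIB`:

```python
import ctypes

# Assumption: libmxnet.so is findable by the dynamic loader; adjust as needed.
lib = ctypes.CDLL("libmxnet.so")

free_mem = ctypes.c_uint64()
total_mem = ctypes.c_uint64()
ret = lib.MXGetGPUMemoryInformation64(ctypes.c_int(0),
                                      ctypes.byref(free_mem),
                                      ctypes.byref(total_mem))
assert ret == 0, "MXNet C API call failed"
print(free_mem.value, total_mem.value)
```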

/*!
* \brief get the MXNet library version as an integer
* \param pointer to the integer holding the version number
10 changes: 10 additions & 0 deletions perl-package/AI-MXNetCAPI/mxnet.i
@@ -344,13 +344,23 @@ int MXGetGPUCount(int* out);

/*!
* \brief get the free and total available memory on a GPU
* Note: deprecated, use MXGetGPUMemoryInformation64().
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
int MXGetGPUMemoryInformation(int dev, int *out, int *out);

/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
int MXGetGPUMemoryInformation64(int dev, uint64_t *out, uint64_t *out);


//-------------------------------------
// Part 1: NDArray creation and deletion
24 changes: 24 additions & 0 deletions python/mxnet/context.py
@@ -258,6 +258,30 @@ def num_gpus():
    check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
    return count.value

def gpu_memory_info(device_id=0):
    """Query CUDA for the free and total bytes of GPU global memory.

    Parameters
    ----------
    device_id : int, optional
        The device id of the GPU device.

    Raises
    ------
    Will raise an exception on any CUDA error.

    Returns
    -------
    (free, total) : (int, int)
        The free and total amount of GPU global memory, in bytes.
    """
    free = ctypes.c_uint64()
    total = ctypes.c_uint64()
    dev_id = ctypes.c_int(device_id)
    check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total)))
    return (free.value, total.value)
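A quick usage sketch of the new helper; the headroom figure and the `budget` variable are illustrative only:

```python
import mxnet as mx

free, total = mx.context.gpu_memory_info(0)  # bytes free/total on GPU 0
print("GPU 0: %.1f of %.1f GiB free" % (free / 1024**3, total / 1024**3))

# e.g. size a test model against the memory actually available, roughly as
# test_gluon_gpu.py:test_large_models does:
budget = int(free * 0.9)  # leave headroom for cudnnFind workspaces
```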

def current_context():
    """Returns the current context.
11 changes: 11 additions & 0 deletions src/c_api/c_api.cc
@@ -122,7 +122,18 @@ int MXGetGPUCount(int* out) {
API_END();
}

// Deprecated: use MXGetGPUMemoryInformation64() instead.
int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem) {
API_BEGIN();
uint64_t free_mem64 = 0UL;
uint64_t total_mem64 = 0UL;
Context::GetGPUMemoryInformation(dev, &free_mem64, &total_mem64);
*free_mem = static_cast<int>(free_mem64);
*total_mem = static_cast<int>(total_mem64);
API_END();
}
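Why the 64-bit variant exists: the narrowing cast in the deprecated function above overflows a signed 32-bit int for GPUs with 2 GiB or more of memory. A quick, purely illustrative check:

```python
total_bytes = 8 * 1024**3       # an 8 GiB card
int32_max = 2**31 - 1           # 2147483647, about 2 GiB
assert total_bytes > int32_max  # static_cast<int> here would overflow
```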

int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem) {
API_BEGIN();
Context::GetGPUMemoryInformation(dev, free_mem, total_mem);
API_END();
66 changes: 28 additions & 38 deletions src/operator/nn/cudnn/cudnn_algoreg-inl.h
@@ -30,6 +30,8 @@
#include <mutex>
#include <string>
#include <vector>
#include <functional>
#include <utility>
#include "../../../common/cuda_utils.h"
#include "../convolution-inl.h"
#include "../deconvolution-inl.h"
@@ -65,7 +67,11 @@ class CuDNNAlgo {
template<typename ParamType>
class CuDNNAlgoReg {
public:
bool Find(const ParamType &param,
using AlgoSetter_t = std::function<void(CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *)>;

void FindOrElseRegister(const ParamType &param,
const std::vector<TShape> &in_shape,
const std::vector<TShape> &out_shape,
cudnnDataType_t cudnn_data_type,
@@ -75,7 +81,8 @@ class CuDNNAlgoReg {
bool add_to_weight,
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt,
const AlgoSetter_t &algo_setter) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
@@ -85,45 +92,28 @@ class CuDNNAlgoReg {
*fwd = i->second.fwd;
*bwd = i->second.bwd;
*flt = i->second.flt;
return true;
}
return false;
}

void Register(const ParamType &param,
const std::vector<TShape> &in_shape,
const std::vector<TShape> &out_shape,
cudnnDataType_t cudnn_data_type,
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
"algorithm, "
"this can take a while... (setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
if (reg_.size() >= 1000) {
// Many people are very concerned about this warning, so change the warning once.
if (!is_warning_autotune_) {
LOG(INFO)
<< "If you see this message in the middle of training, you are "
"probably using bucketing. Consider setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
is_warning_autotune_ = true;
} else {
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
"algorithm, "
"this can take a while... (setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
if (reg_.size() >= 1000) {
// Many people are very concerned about this warning, so change the warning once.
if (!is_warning_autotune_) {
LOG(INFO)
<< "If you see this message in the middle of training, you are "
"probably using bucketing. Consider setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
is_warning_autotune_ = true;
}
}
}
// Call the provided function to determine the algos (likely via cudnnFind() or cudnnGet()).
algo_setter(fwd, bwd, flt);
// Save the result so future lookups hit in this registry.
reg_.insert(std::pair<ParamKey, CudnnAlgorithms>(key, CudnnAlgorithms{*fwd, *bwd, *flt}));
}
reg_[key].fwd = fwd;
reg_[key].bwd = bwd;
reg_[key].flt = flt;
}

static CuDNNAlgoReg *Get();
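The heart of this change is that lookup and registration now happen under a single lock, so only one thread at a time runs the expensive autotuning step for a given key. A minimal sketch of that find-or-else-register pattern in Python; the class and names are illustrative, not the MXNet implementation:

```python
import threading

class AlgoRegistry:
    """Caches autotuning results; serializes the expensive search."""

    def __init__(self):
        self._lock = threading.Lock()
        self._reg = {}

    def find_or_else_register(self, key, algo_setter):
        # Holding the lock across algo_setter() both protects the dict and
        # serializes concurrent cudnnFind()-style searches, bounding their
        # combined workspace and memory-headroom needs.
        with self._lock:
            if key not in self._reg:
                self._reg[key] = algo_setter()  # run the costly search once
            return self._reg[key]

registry = AlgoRegistry()
algos = registry.find_or_else_register(
    ("conv2d", "NCHW", "float32"),
    lambda: {"fwd": 1, "bwd_data": 1, "bwd_filter": 1})
```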
