-
Notifications
You must be signed in to change notification settings - Fork 6.8k
CudnnFind() usage improvements #12804
Changes from 10 commits
7ad40a2
94614a5
8369e1e
aff8df4
e95eb25
7434d4b
7bf8d51
7f724c9
7596ad6
aa60af6
08b9f55
83df6d5
e043791
810af44
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,6 +258,30 @@ def num_gpus(): | |
check_call(_LIB.MXGetGPUCount(ctypes.byref(count))) | ||
return count.value | ||
|
||
def gpu_memory_info(device_id=0): | ||
"""Query CUDA for the free and total bytes of GPU global memory. | ||
|
||
Parameters | ||
---------- | ||
device_id : int, optional | ||
The device id of the GPU device. | ||
|
||
Raises | ||
------ | ||
Will raise an exception on any CUDA error. | ||
|
||
Returns | ||
------- | ||
(free, total) : (int, int) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor: 'total' - is it referring to total used, total available or the total size of the physical GPU? Also, aren't they 64-bit integers? So maybe 'long' would be more appropriate. Since we are exposing this API in Python, it'd be a good idea to document it well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I prefer to leave this as is. Regarding int vs long, I'm not a Python wizard, but ints are 'plain integers' and longs have unlimited precision: "Plain integers have at least 32 bits of precision; long integers have unlimited precision."
And unfortunately, there's not a real short answer to what 'total' memory means. We're wrapping the cuda call cudaMemGetInfo(), and the NVIDIA documentation says: "Returns the free and total amount of memory available for allocation by the device."
Let's say you've got a GPU with published memory T. The GPU driver puts some control structures like the page table in that memory, so call that driver overhead D. Finally, your GPU may be driving a monitor, so a window manager is using the GPU with overhead W. So what does the API return for 'total' in this scenario? The answer is T - D. The long answer then is: 'total' means the total memory available to both your MXNet process and other processes that may be using the GPU. I don't know a way to suggest this succinctly without introducing more confusion. |
||
The free and total GPU global memory, in bytes. | ||
|
||
""" | ||
free = ctypes.c_uint64() | ||
total = ctypes.c_uint64() | ||
dev_id = ctypes.c_int(device_id) | ||
check_call(_LIB.MXGetGPUMemoryInformation(dev_id, ctypes.byref(free), ctypes.byref(total))) | ||
return (free.value, total.value) | ||
|
||
def current_context(): | ||
"""Returns the current context. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,8 @@ | |
#include <mutex> | ||
#include <string> | ||
#include <vector> | ||
#include <functional> | ||
#include <utility> | ||
#include "../../../common/cuda_utils.h" | ||
#include "../convolution-inl.h" | ||
#include "../deconvolution-inl.h" | ||
|
@@ -65,7 +67,11 @@ class CuDNNAlgo { | |
template<typename ParamType> | ||
class CuDNNAlgoReg { | ||
public: | ||
bool Find(const ParamType &param, | ||
using AlgoSetter_t = std::function<void(CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *, | ||
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *, | ||
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *)>; | ||
|
||
void FindOrElseRegister(const ParamType &param, | ||
const std::vector<TShape> &in_shape, | ||
const std::vector<TShape> &out_shape, | ||
cudnnDataType_t cudnn_data_type, | ||
|
@@ -75,7 +81,7 @@ class CuDNNAlgoReg { | |
bool add_to_weight, | ||
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd, | ||
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd, | ||
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) { | ||
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt, AlgoSetter_t algo_setter) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would using a const ref for algo_setter save us a copy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor: Other parameters are on their own lines. Maybe algo_setter can go in its own line. |
||
CHECK(in_shape.size() == 2 || in_shape.size() == 3); | ||
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, | ||
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; | ||
|
@@ -85,45 +91,28 @@ class CuDNNAlgoReg { | |
*fwd = i->second.fwd; | ||
*bwd = i->second.bwd; | ||
*flt = i->second.flt; | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
void Register(const ParamType &param, | ||
const std::vector<TShape> &in_shape, | ||
const std::vector<TShape> &out_shape, | ||
cudnnDataType_t cudnn_data_type, | ||
cudnnDataType_t cudnn_forward_compute_type, | ||
cudnnDataType_t cudnn_backward_compute_type, | ||
int sm_arch, | ||
bool add_to_weight, | ||
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd, | ||
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd, | ||
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) { | ||
CHECK(in_shape.size() == 2 || in_shape.size() == 3); | ||
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, | ||
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; | ||
std::lock_guard<std::mutex> guard(lock_); | ||
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { | ||
LOG(INFO) << "Running performance tests to find the best convolution " | ||
"algorithm, " | ||
"this can take a while... (setting env variable " | ||
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)"; | ||
if (reg_.size() >= 1000) { | ||
// Many people are very concerned about this warning, so change the warning once. | ||
if (!is_warning_autotune_) { | ||
LOG(INFO) | ||
<< "If you see this message in the middle of training, you are " | ||
"probably using bucketing. Consider setting env variable " | ||
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning."; | ||
is_warning_autotune_ = true; | ||
} else { | ||
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { | ||
LOG(INFO) << "Running performance tests to find the best convolution " | ||
"algorithm, " | ||
"this can take a while... (setting env variable " | ||
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)"; | ||
if (reg_.size() >= 1000) { | ||
// Many people are very concerned about this warning, so change the warning once. | ||
if (!is_warning_autotune_) { | ||
LOG(INFO) | ||
<< "If you see this message in the middle of training, you are " | ||
"probably using bucketing. Consider setting env variable " | ||
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning."; | ||
is_warning_autotune_ = true; | ||
} | ||
} | ||
} | ||
// Call provided function to determine the algos- likely uses cudnnFind() or cudnnGet() | ||
algo_setter(fwd, bwd, flt); | ||
// Save result so future lookups hit in this registry | ||
reg_.insert(std::pair<ParamKey, CudnnAlgorithms>(key, CudnnAlgorithms{*fwd, *bwd, *flt})); | ||
} | ||
reg_[key].fwd = fwd; | ||
reg_[key].bwd = bwd; | ||
reg_[key].flt = flt; | ||
} | ||
|
||
static CuDNNAlgoReg *Get(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@szha Given the usage I'd consider this a non-breaking change, but it is technically changing types to an argument. From a SemVer perspective are you ok with changing this in a minor update?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I also pondered this question and it's worth discussing. As the MXGetGPUMemoryInformation interface stands now, it would only report global memory total and free bytes < 2MB correctly. It's been many years since GPUs had so little global memory, so it's hard to imagine anyone out there using an interface so broken. MXNet didn't wrap this C++ API in Python, although it was exposed in Perl I think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I also grepped through the code and only see references to this API being used from Perl.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think @DickJC123 you meant 2GB? One alternative is to provide the new interface as MXGetGPUMemoryInformation64 and update the old interface in major version