cuDNN non-persistent bidirectional RNN dgrad sync fix (#16391)
* Alter test_lstm_bidirectional to demo fast-fail with optional wgrad.

* Fix cuDNN RNN dgrad sync.

* Simplify gpu activity sync sequence.

* Remove repeated running of now-passing test.

* Trigger CI
DickJC123 authored and ptrendx committed Oct 10, 2019
1 parent cfe9e50 commit a2018ba
Showing 2 changed files with 58 additions and 32 deletions.
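
The substance of the fix is the `SyncDgrad()` helper added to `src/operator/rnn-inl.h` below: recording a CUDA event on the legacy default stream right after the dgrad call creates a device-side ordering point for everything already queued on blocking streams, without stalling the CPU. The standalone CUDA sketch that follows is illustrative only (the names `spin_kernel`, `s_dgrad`, `s_other`, and `sync_event` are made up and not part of the commit); it shows the mechanism the fix relies on: a kernel launched on a second stream cannot start before a kernel launched earlier on a first stream once an event record on `cudaStreamLegacy` sits between the two launches.

```cuda
// Standalone illustration (hypothetical names, not part of this commit): an event recorded
// on the legacy default stream orders later blocking-stream work after earlier
// blocking-stream work, which is the mechanism SyncDgrad() relies on.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void spin_kernel(float* data, int iters) {
  float v = data[threadIdx.x];
  for (int i = 0; i < iters; ++i) v = v * 1.0000001f + 1e-7f;  // busy work
  data[threadIdx.x] = v;
}

int main() {
  float* buf = nullptr;
  cudaMalloc(&buf, 256 * sizeof(float));

  // Two ordinary streams; streams created without cudaStreamNonBlocking synchronize
  // implicitly with the legacy default stream.
  cudaStream_t s_dgrad, s_other;
  cudaStreamCreate(&s_dgrad);
  cudaStreamCreate(&s_other);

  cudaEvent_t sync_event;
  cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming);

  // Stand-in for the dgrad kernels cuDNN launches internally.
  spin_kernel<<<1, 256, 0, s_dgrad>>>(buf, 1 << 20);

  // One event record on the legacy default stream: the record waits for all prior work in
  // blocking streams, and all later blocking-stream work waits for the record. The CPU is
  // never blocked.
  cudaEventRecord(sync_event, cudaStreamLegacy);

  // Stand-in for whatever consumes the data gradient next; it cannot overtake the kernel
  // launched on s_dgrad above.
  spin_kernel<<<1, 256, 0, s_other>>>(buf, 1);

  cudaDeviceSynchronize();
  printf("ordered without a CPU-side wait\n");

  cudaEventDestroy(sync_event);
  cudaStreamDestroy(s_dgrad);
  cudaStreamDestroy(s_other);
  cudaFree(buf);
  return 0;
}
```

In the commit itself, the event is recorded right after `cudnnRNNBackwardDataEx`/`cudnnRNNBackwardData` for cuDNN versions up to 7.6.4 when the non-persistent algorithm is used on a bidirectional RNN, so the now-optional weight-gradient pass, or any other downstream consumer of the data gradients, can no longer race with dgrad kernels that cuDNN left unsynchronized.
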
83 changes: 54 additions & 29 deletions src/operator/rnn-inl.h
@@ -493,6 +493,7 @@ class RNNOp {

CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_));
CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_));
CUDA_CALL(cudaEventCreateWithFlags(&dgrad_sync_event_, cudaEventDisableTiming));

#if MXNET_USE_CUDNN_GE_7200
CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_));
@@ -537,6 +538,7 @@ class RNNOp {
CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_));
CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_));
CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_));
CUDA_CALL(cudaEventDestroy(dgrad_sync_event_));

if (init_cudnn_) {
for (size_t i = 0; i < x_desc_vec_.size(); ++i) {
@@ -1066,20 +1068,23 @@ class RNNOp {
workspace_byte_,
reserve_space_.dptr,
reserve_space_byte_));
CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_,
rnn_desc_,
x_data_desc_,
x.dptr_,
hx_desc_,
hx.dptr_,
y_data_desc_,
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
SyncDgrad();
if (req[rnn_enum::kParams] != kNullOp) {
CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_,
rnn_desc_,
x_data_desc_,
x.dptr_,
hx_desc_,
hx.dptr_,
y_data_desc_,
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
}
#else
CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_,
rnn_desc_,
@@ -1108,21 +1113,24 @@ class RNNOp {
workspace_byte_,
reserve_space_.dptr,
reserve_space_byte_));
CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_,
rnn_desc_,
param_.seq_length_,
x_desc_vec_.data(),
x.dptr_,
hx_desc_,
hx.dptr_,
y_desc_vec_.data(),
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
SyncDgrad();
if (req[rnn_enum::kParams] != kNullOp) {
CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_,
rnn_desc_,
param_.seq_length_,
x_desc_vec_.data(),
x.dptr_,
hx_desc_,
hx.dptr_,
y_desc_vec_.data(),
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
}
#endif // MXNET_USE_CUDNN_GE_7200
#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)

@@ -1365,6 +1373,7 @@ class RNNOp {
// RNN descriptors
cudnnDataType_t dtype_with_fallback_;
cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
dgrad_sync_needed_ = (rnn_algo == CUDNN_RNN_ALGO_STANDARD) && param_.bidirectional;
// On archs 50 and 52 (Maxwell), the gpu doesn't support native fp16 compute.
// Before cuDNN 7.5.0, when running fp16, cuDNN fell back to fp32 under the hood on Maxwell.
// That's not the case beginning from 7.5.0. Hence the fallback is added explicitly here.
@@ -1484,6 +1493,20 @@ class RNNOp {
}
#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
}

#if MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
// cuDNN versions up to and including v7.6.4 did not sync a last dgrad kernel back to the main
// cudnn handle's stream (non-persistent algo, bidirectional only). This could result in silent
// non-deterministic failures with very low probability, seen more often when wgrad is bypassed.
inline void SyncDgrad() {
if (CUDNN_VERSION <= 7604 && dgrad_sync_needed_) {
// Without blocking the CPU, create a synchronization point for all current GPU activity. No
// need to call cudaStreamWaitEvent; cudaEventRecord on the legacy default stream suffices.
CUDA_CALL(cudaEventRecord(dgrad_sync_event_, cudaStreamLegacy));
}
}
#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)

#if MXNET_USE_CUDNN == 1
cudnnDataType_t dtype_;
bool init_cudnn_;
@@ -1511,6 +1534,8 @@ class RNNOp {
bool cudnn_tensor_core_;

cudnnTensorFormat_t format_;
cudaEvent_t dgrad_sync_event_;
bool dgrad_sync_needed_ = false;
#endif // MXNET_USE_CUDNN
bool init_space_, temp_init_space_;
size_t reserve_cpu_space_size_, temp_cpu_space_size_;
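
For orientation, here is a condensed, compilable mock of the backward control flow this file now implements (all names are hypothetical: `dgrad_stub` and `wgrad_stub` stand in for `cudnnRNNBackwardDataEx` and `cudnnRNNBackwardWeightsEx`, and `BackwardSketch` is not an MXNet function): the data gradient always runs, the device-side sync always follows it, and the weight gradient runs only when a parameter gradient was actually requested.

```cuda
// Condensed mock of the backward control flow added above (hypothetical names; the real code
// calls cudnnRNNBackwardDataEx / cudnnRNNBackwardWeightsEx rather than these stubs).
#include <cuda_runtime.h>
#include <cstdio>

__global__ void dgrad_stub(float* dx) { dx[threadIdx.x] += 1.f; }  // stand-in for dgrad
__global__ void wgrad_stub(float* dw) { dw[threadIdx.x] += 1.f; }  // stand-in for wgrad

void BackwardSketch(cudaStream_t stream, cudaEvent_t sync_event,
                    float* dx, float* dw, bool need_wgrad) {
  dgrad_stub<<<1, 32, 0, stream>>>(dx);           // data gradient, always computed
  cudaEventRecord(sync_event, cudaStreamLegacy);  // SyncDgrad(): device-side ordering point
  if (need_wgrad) {                               // skipped when req[kParams] == kNullOp
    wgrad_stub<<<1, 32, 0, stream>>>(dw);         // weight gradient, only on request
  }
}

int main() {
  float *dx = nullptr, *dw = nullptr;
  cudaMalloc(&dx, 32 * sizeof(float));
  cudaMalloc(&dw, 32 * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaEvent_t sync_event;
  cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming);

  // Skipping wgrad no longer removes the ordering point that follows dgrad.
  BackwardSketch(stream, sync_event, dx, dw, /*need_wgrad=*/false);
  cudaDeviceSynchronize();
  printf("backward sketch finished\n");

  cudaEventDestroy(sync_event);
  cudaStreamDestroy(stream);
  cudaFree(dx);
  cudaFree(dw);
  return 0;
}
```

Making the wgrad call conditional on `req[rnn_enum::kParams] != kNullOp` is also what exposed the missing sync, since the previously unconditional wgrad call had been masking the race much of the time; the in-code comment above notes the failures were seen more often when wgrad is bypassed.
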
7 changes: 4 additions & 3 deletions tests/python/unittest/test_operator.py
@@ -68,11 +68,11 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e
dy = mx.random.uniform(shape=mod1.get_outputs()[0].shape)
mod1.backward(out_grads=[dy])
mod2.backward(out_grads=[dy])
if grad_req != 'null':
assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)
else:
if type(grad_req) is dict and grad_req['data'] == 'null' or grad_req == 'null':
assert(mod1.get_input_grads()[0] == None)
assert(mod2.get_input_grads()[0] == None)
else:
assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)


@with_seed()
@@ -149,6 +149,7 @@ def test_lstm_bidirectional():
check_rnn_consistency(fused, stack, T, N, I, H, 'write')
check_rnn_consistency(fused, stack, T, N, I, H, 'add')
check_rnn_consistency(fused, stack, T, N, I, H, 'null')
check_rnn_consistency(fused, stack, T, N, I, H, {'data': 'add', 'parameters': 'null'})

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')