From a2018ba6c6b0df6fb1f2172faf311e260f66ce76 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 10 Oct 2019 13:34:35 -0700 Subject: [PATCH] cuDNN non-persistant bidirectional RNN dgrad sync fix (#16391) * Alter test_lstm_bidirectional to demo fast-fail with optional wgrad. * Fix cuDNN RNN dgrad sync. * Simplify gpu activity sync sequence. * Remove repeated running of now-passing test. * Trigger CI --- src/operator/rnn-inl.h | 83 +++++++++++++++++--------- tests/python/unittest/test_operator.py | 7 ++- 2 files changed, 58 insertions(+), 32 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 5652960baa43..cf6fe10fd328 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -493,6 +493,7 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); + CUDA_CALL(cudaEventCreateWithFlags(&dgrad_sync_event_, cudaEventDisableTiming)); #if MXNET_USE_CUDNN_GE_7200 CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); @@ -537,6 +538,7 @@ class RNNOp { CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDA_CALL(cudaEventDestroy(dgrad_sync_event_)); if (init_cudnn_) { for (size_t i = 0; i < x_desc_vec_.size(); ++i) { @@ -1066,20 +1068,23 @@ class RNNOp { workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - y_data_desc_, - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); + SyncDgrad(); + if (req[rnn_enum::kParams] != kNullOp) { + CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + y_data_desc_, + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + 
reserve_space_byte_)); + } #else CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, rnn_desc_, param_.seq_length_, y_desc_vec_.data(), y.dptr_, dy_desc_vec_.data(), dy.dptr_, dhy_desc_, dhy_ptr, dcy_desc_, dcy_ptr, w_desc_, w.dptr_, hx_desc_, hx.dptr_, cx_desc_, cx_ptr, dx_desc_vec_.data(), dx.dptr_, dhx_desc_, dhx.dptr_, dcx_desc_, dcx_ptr, temp_space.dptr_, @@ -1108,21 +1113,24 @@ class RNNOp { workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - y_desc_vec_.data(), - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); + SyncDgrad(); + if (req[rnn_enum::kParams] != kNullOp) { + CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + } #endif // MXNET_USE_CUDNN_GE_7200 #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) @@ -1365,6 +1373,7 @@ class RNNOp { // RNN descriptors cudnnDataType_t dtype_with_fallback_; cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; + dgrad_sync_needed_ = (rnn_algo == CUDNN_RNN_ALGO_STANDARD) && param_.bidirectional; // On arch's 50 and 52(Maxwell), the gpu doesn't support native fp16 compute. // Before cuDNN 7.5.0, when running fp16, cuDNN fallback to fp32 under the hood on Maxwell. // That's not the case begining from 7.5.0. Thereby adding fallback explicitly here. @@ -1484,6 +1493,20 @@ class RNNOp { } #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) } + +#if MXNET_USE_CUDNN == 1 && defined(__CUDACC__) + // cuDNN versions up to and including v7.6.4 did not sync a last dgrad kernel back to the main + // cudnn handle's stream (non-persistent algo, bidirectional only). This could result in silent + // non-deterministic failures with very low probability, seen more often when wgrad is bypassed. 
+ inline void SyncDgrad() { + if (CUDNN_VERSION <= 7604 && dgrad_sync_needed_) { + // Without blocking the CPU, create a synchronization point of all current GPU activity. No + // need to call cudaStreamWaitEvent- cudaEventRecord on the legacy default stream suffices. + CUDA_CALL(cudaEventRecord(dgrad_sync_event_, cudaStreamLegacy)); + } + } +#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) + #if MXNET_USE_CUDNN == 1 cudnnDataType_t dtype_; bool init_cudnn_; @@ -1511,6 +1534,8 @@ class RNNOp { bool cudnn_tensor_core_; cudnnTensorFormat_t format_; + cudaEvent_t dgrad_sync_event_; + bool dgrad_sync_needed_ = false; #endif // MXNET_USE_CUDNN bool init_space_, temp_init_space_; size_t reserve_cpu_space_size_, temp_cpu_space_size_; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 4f981d5e29c7..35460676da28 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -68,11 +68,11 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e dy = mx.random.uniform(shape=mod1.get_outputs()[0].shape) mod1.backward(out_grads=[dy]) mod2.backward(out_grads=[dy]) - if grad_req != 'null': - assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol) - else: + if type(grad_req) is dict and grad_req['data'] == 'null' or grad_req == 'null': assert(mod1.get_input_grads()[0] == None) assert(mod2.get_input_grads()[0] == None) + else: + assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol) @with_seed() @@ -149,6 +149,7 @@ def test_lstm_bidirectional(): check_rnn_consistency(fused, stack, T, N, I, H, 'write') check_rnn_consistency(fused, stack, T, N, I, H, 'add') check_rnn_consistency(fused, stack, T, N, I, H, 'null') + check_rnn_consistency(fused, stack, T, N, I, H, {'data': 'add', 'parameters': 'null'}) @with_seed() 
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')