From 8740d0fa046b6626cc7a2010671120b3fd2e59df Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 10:35:44 +0800 Subject: [PATCH 01/25] change RNN OP to stateful --- src/operator/cudnn_rnn-inl.h | 93 ++++++----- src/operator/rnn-inl.h | 264 +++++--------------------------- src/operator/rnn.cc | 288 +++++++++++++++++++++++++++++++++-- src/operator/rnn.cu | 67 ++++++-- 4 files changed, 416 insertions(+), 296 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index cc8e4db404da..425c67864e28 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -27,7 +27,6 @@ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 - #include #include #include @@ -40,8 +39,9 @@ namespace mxnet { namespace op { #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 template -class CuDNNRNNOp : public Operator { +class CuDNNRNNOp { public: + RNNParam param_; explicit CuDNNRNNOp(RNNParam param) { this->param_ = param; init_cudnn_ = false; @@ -99,12 +99,6 @@ class CuDNNRNNOp : public Operator { #endif // RNN Direction direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - // Other - if (param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; - // Create descriptors CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); @@ -166,18 +160,20 @@ class CuDNNRNNOp : public Operator { #endif } - virtual void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 3 : 2; - if (!param_.state_outputs) - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); Stream *s = ctx.get_stream(); // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); @@ -191,10 +187,9 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; - - if (param_.lstm_q_) + if (param_.mode == rnn_enum::kLstm) cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - if (param_.lstm_q_ && param_.state_outputs) + if (param_.mode == rnn_enum::kLstm && param_.state_outputs) cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; CHECK_EQ(x.CheckContiguous(), true); @@ -367,24 +362,26 @@ class CuDNNRNNOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 
3 : 2; - if (!param_.state_outputs) - out_expected = 1; - - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(in_grad.size(), in_expected); - CHECK_EQ(out_grad.size(), out_expected); - CHECK_EQ(req.size(), in_expected); + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; Stream *s = ctx.get_stream(); @@ -534,13 +531,17 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR >= 5 format_ = CUDNN_TENSOR_NCHW; #endif - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 3 : 2; - if (!param_.state_outputs) - out_expected = 1; - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors @@ -854,10 +855,8 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR >= 5 cudnnTensorFormat_t format_; #endif - RNNParam param_; }; -#endif // __CUDACC__ && CUDNN +#endif // CUDNN } // namespace op } // namespace mxnet - #endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 71ad331786ae..0460c193de5b 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -160,9 +160,8 @@ struct RNNParam : public dmlc::Parameter { uint32_t num_layers; bool bidirectional, state_outputs; int mode; - float p, pkeep_; + float p; int seq_length_, batch_size_, input_size_; - bool lstm_q_; // whether type is lstm dmlc::optional projection_size; dmlc::optional lstm_state_clip_min, lstm_state_clip_max; bool lstm_state_clip_nan; @@ -212,7 +211,6 @@ struct RNNParam : public dmlc::Parameter { } }; - /** * @params: ws: Temp workspace for gemm's output storage. * rs: Reserve space of forward intermediate data used for training. @@ -236,6 +234,7 @@ struct RNNParam : public dmlc::Parameter { * hy's shape is [num_layers, batch_size, state_size] * cy_ptr: Only used in lstm mode. pointer of tensor cy containing the cell state * for t=seq_length. cy' shape is [num_layers, batch_size, state_size] + * dropout: should be 0 <= dropout < 1 * mode: Specifies the type of RNN to compute. 
*/ template @@ -377,7 +376,7 @@ void RNNBackward(DType* ws, } template -class RNNOp : public Operator{ +class RNNOp { public: explicit RNNOp(RNNParam p) :param_(p), init_space_(false), reserve_space_size_(0) { @@ -397,23 +396,24 @@ class RNNOp : public Operator{ } } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - if (!param_.state_outputs) { - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); Stream *s = ctx.get_stream(); // get input + output tensor Tensor x = in_data[rnn_enum::kData].get(s); @@ -427,7 +427,6 @@ class RNNOp : public Operator{ param_.seq_length_ = x.shape_[0]; param_.batch_size_ = x.shape_[1]; param_.input_size_ = x.shape_[2]; - const int direction = param_.bidirectional ? 2 : 1; const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; @@ -451,7 +450,6 @@ class RNNOp : public Operator{ param_.state_size, direction, param_.mode); Tensor workspace = ctx.requested[rnn_enum::kTempSpace] .get_space_typed(Shape1(workspace_size), s); - if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -460,7 +458,6 @@ class RNNOp : public Operator{ Storage::Get()->Free(reserve_space_); init_space_ = false; } - if (!init_space_) { reserve_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); reserve_space_size_ = r_size; @@ -468,6 +465,7 @@ class RNNOp : public Operator{ } DType* reserve_space_ptr = static_cast(reserve_space_.dptr); + RNNForwardTraining(workspace.dptr_, reserve_space_ptr, param_.state_outputs, @@ -508,28 +506,30 @@ class RNNOp : public Operator{ } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - if (!param_.state_outputs) { - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(in_grad.size(), in_expected); - CHECK_EQ(out_grad.size(), out_expected); - CHECK_EQ(req.size(), in_expected); + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; mshadow::Stream *s = ctx.get_stream(); @@ -556,6 +556,7 @@ class RNNOp : public Operator{ const int direction = param_.bidirectional ? 2 : 1; const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; DType * dhy_ptr = NULL; @@ -585,6 +586,7 @@ class RNNOp : public Operator{ size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); + if (!init_space_ || reserve_space_size_ != r_size) { LOG(FATAL) << "Check forward init error"; } @@ -620,203 +622,15 @@ class RNNOp : public Operator{ param_.mode); } - private: RNNParam param_; + + private: bool init_space_; size_t reserve_space_size_; Storage::Handle reserve_space_; }; // class RNNOp -template -Operator* CreateOp(RNNParam param, int dtype); - -#if DMLC_USE_CXX11 -class RNNProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "state_cell"}; - } else { - return {"data", "parameters", "state"}; - } - } - - std::vector ListOutputs() const override { - std::vector outputs = {"output"}; - if (!param_.state_outputs) - return outputs; - else - outputs.emplace_back("state"); - if (param_.mode == rnn_enum::kLstm) - outputs.emplace_back("state_cell"); - return outputs; - } - - int NumOutputs() const override { - int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; - int num_outputs = param_.state_outputs ? (mode_num + 1) : 1; - return num_outputs; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { - using namespace mshadow; - if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4U) << "Input:[data, parameters, state, cell_state]"; - } else { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, parameters, state]"; - } - const mxnet::TShape &dshape = (*in_shape)[rnn_enum::kData]; - if (dshape.ndim() == 0) return false; - CHECK_EQ(dshape.ndim(), 3U) \ - << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; - // data: [sequence len, batch, input dimension] - int batch_size = dshape[1]; - int input_size = dshape[2]; - int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional - int layer_size = (param_.projection_size.has_value()) ? 
- param_.projection_size.value() : param_.state_size; - SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kState, - Shape3(total_layers, batch_size, layer_size)); - if (param_.mode == rnn_enum::kLstm) - SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size)); - - // calculate parameter vector length - int param_size = GetRnnParamSize(param_.num_layers, - input_size, - param_.state_size, - numDirections, - param_.mode, - param_.projection_size); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); - - out_shape->clear(); - // output: [sequence len, batch, output size] - mxnet::TShape oshape = dshape; - if (param_.projection_size.has_value()) { - oshape[2] = numDirections * param_.projection_size.value(); - } else { - oshape[2] = numDirections * param_.state_size; - } - out_shape->push_back(oshape); - if (!param_.state_outputs) { - return true; - } else { - // outStateShape: [layer_num, batch, state size] - mxnet::TShape outStateShape = dshape; - outStateShape[0] = total_layers; - outStateShape[1] = batch_size; - if (param_.projection_size.has_value()) { - outStateShape[2] = param_.projection_size.value(); - } else { - outStateShape[2] = param_.state_size; - } - out_shape->push_back(outStateShape); - // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) { - mxnet::TShape cellStateShape = dshape; - cellStateShape[0] = total_layers; - cellStateShape[1] = batch_size; - cellStateShape[2] = param_.state_size; - out_shape->push_back(cellStateShape); - } - return true; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (size_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - if (!param_.state_outputs) { - return true; - } else { - out_type->push_back(dtype); - // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) - out_type->push_back(dtype); - return true; - } - } - - OperatorProperty* Copy() const override { - auto ptr = new RNNProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "RNN"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - std::vector dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams], - in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]}; - - if (param_.state_outputs) { - dep.push_back(out_data[rnn_enum::kStateOut]); - dep.push_back(out_grad[rnn_enum::kStateOut]); - } - - if (param_.mode == rnn_enum::kLstm) { - dep.push_back(in_data[rnn_enum::kStateCell]); - if (param_.state_outputs) { - dep.push_back(out_data[rnn_enum::kStateCellOut]); - dep.push_back(out_grad[rnn_enum::kStateCellOut]); - } - } - return dep; - } - - std::vector ForwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, 
mxnet::ShapeVector *in_shape, - std::vector *in_type) const override; - - private: - RNNParam param_; -}; // class RNNProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet + #endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 621b9eb110e7..6a9b7c001477 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -24,27 +24,244 @@ * \author Sebastian Bodenstein */ #include "./rnn-inl.h" +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 +#include "./cudnn_rnn-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { -template<> -Operator *CreateOp(RNNParam param, int dtype) { - Operator *op = nullptr; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new RNNOp(param); + +DMLC_REGISTER_PARAMETER(RNNParam); +static inline std::vector ListArguments(const RNNParam& param_) { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "parameters", "state", "state_cell"}; + } else { + return {"data", "parameters", "state"}; + } +} + +static bool RNNShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const RNNParam& param_ = nnvm::get(attrs.parsed); + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4U) << "Needed input:[data, parameters, state, cell_state]," + << " got in_shape->size(): " << in_shape->size(); + } else { + CHECK_EQ(in_shape->size(), 3U) << + "Needed input:[data, parameters, state], got in_shape->size(): " << in_shape->size(); + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3U) \ + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // data: [sequence len, batch, input dimension] + int batch_size = dshape[1]; + int input_size = dshape[2]; + int numDirections = param_.bidirectional ? 2 : 1; + int total_layers = numDirections * param_.num_layers; // double for bidirectional + int layer_size = (param_.projection_size.has_value()) ? 
+ param_.projection_size.value() : param_.state_size; + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kState, + Shape3(total_layers, batch_size, layer_size)); + if (param_.mode == rnn_enum::kLstm) { + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateCell, + Shape3(total_layers, batch_size, param_.state_size)); + } + + // calculate parameter vector length + int param_size = GetRnnParamSize(param_.num_layers, + input_size, + param_.state_size, + numDirections, + param_.mode, + param_.projection_size); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + out_shape->clear(); + // output: [sequence len, batch, output size] + TShape oshape = dshape; + if (param_.projection_size.has_value()) { + oshape[2] = numDirections * param_.projection_size.value(); + } else { + oshape[2] = numDirections * param_.state_size; + } + out_shape->push_back(oshape); + if (param_.state_outputs) { + // outStateShape: [layer_num, batch, state size] + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batch_size; + if (param_.projection_size.has_value()) { + outStateShape[2] = param_.projection_size.value(); + } else { + outStateShape[2] = param_.state_size; + } + out_shape->push_back(outStateShape); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) { + TShape cellStateShape = dshape; + cellStateShape[0] = total_layers; + cellStateShape[1] = batch_size; + cellStateShape[2] = param_.state_size; + out_shape->push_back(cellStateShape); + } + } + return true; +} + +static bool RNNType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const RNNParam& param_ = nnvm::get(attrs.parsed); + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_type->size(), 4U); + } else { + CHECK_EQ(in_type->size(), 3U); + } + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (size_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + TYPE_ASSIGN_CHECK(*in_type, i, dtype); + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + if (param_.state_outputs) { + out_type->push_back(dtype); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + } + return true; +} + +inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + DispatchMode wanted_mode = DispatchMode::kFCompute; + + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +inline static bool BackwardRNNStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + DispatchMode wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +struct RNNGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr &n, + const std::vector &ograd) const { + const RNNParam& params = nnvm::get(n->attrs.parsed); + std::vector heads{ n->inputs[rnn_enum::kData], + n->inputs[rnn_enum::kParams], n->inputs[rnn_enum::kState] }; + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kOut, 0}); + heads.push_back(ograd[rnn_enum::kOut]); + if (params.state_outputs) { + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kStateOut, 0}); + heads.push_back(ograd[rnn_enum::kStateOut]); + } + if 
(params.mode == rnn_enum::kLstm) { + heads.push_back(n->inputs[rnn_enum::kStateCell]); + if (params.state_outputs) { + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kStateCellOut, 0}); + heads.push_back(ograd[rnn_enum::kStateCellOut]); + } + } + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, + const Context ctx, + const mxnet::ShapeVector &in_shapes, + const std::vector &in_types) { + const RNNParam& param = nnvm::get(attrs.parsed); + OpStatePtr state = OpStatePtr(); + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { + #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + state = OpStatePtr::Create>(param); + #else + state = OpStatePtr::Create>(param); + #endif + return state; }); - return op; + return OpStatePtr(); // should never reach here } -Operator *RNNProp::CreateOperatorEx(Context ctx, - mxnet::ShapeVector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); } +/* +index description +0: x +1: w +2: hx +3: y +4: dy +5: hy +6: dhy +7: cx +8: cy +9: dcy +*/ +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; -DMLC_REGISTER_PARAMETER(RNNParam); + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); + } -MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); + } + } + + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); +} + +NNVM_REGISTER_OP(RNN) .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. @@ -97,7 +314,38 @@ The definition of GRU here is slightly different from paper but compatible with z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ - \end{array})code") + \end{array} +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return params.mode == rnn_enum::kLstm ? 4 : 3; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + // kOut + int num_outputs = 1; + if (params.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (params.mode == rnn_enum::kLstm) ? 
3 : 2; + } + + return num_outputs; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return ListArguments(params); +}) +.set_attr("FInferShape", RNNShape) +.set_attr("FInferType", RNNType) +.set_attr("FInferStorageType", RNNStorageType) +.set_attr("FCreateOpState", CreateRNNState) +.set_attr("FStatefulCompute", RNNStatefulCompute) +.set_attr("FGradient", RNNGrad{"_backward_RNN"}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) .add_argument("data", "NDArray-or-Symbol", "Input data to RNN") .add_argument("parameters", "NDArray-or-Symbol", "Vector of all RNN trainable parameters concatenated") @@ -105,5 +353,19 @@ The definition of GRU here is slightly different from paper but compatible with .add_argument("state_cell", "NDArray-or-Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_RNN) +.set_num_outputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return params.mode == rnn_enum::kLstm ? 4 : 3; +}) +.set_attr_parser(ParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", BackwardRNNStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 402a8cf5f503..91f38f91f67c 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -32,18 +32,63 @@ namespace mxnet { namespace op { -template<> -Operator* CreateOp(RNNParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNRNNOp(param); - }) -#else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR - return op; + +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + CuDNNRNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); + #else + LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + #endif // MXNET_USE_CUDNN && CUDNN_MAJOR } +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; + int dtype = inputs[rnn_enum::kData].type_flag_; + #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + CuDNNRNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); + } + + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); + } + } + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); + #else + 
LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + #endif // MXNET_USE_CUDNN && CUDNN_MAJOR +} + +NNVM_REGISTER_OP(RNN) +.set_attr("FStatefulCompute", RNNStatefulCompute); + +NNVM_REGISTER_OP(_backward_RNN) +.set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet From 9c38854721ab0e86799d300445ebff7e5af6dff3 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 12:26:03 +0800 Subject: [PATCH 02/25] retrigger the ci From 50440e9a148b698fa67ae842c5f694e9208cf2be Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 13:22:05 +0800 Subject: [PATCH 03/25] fix windows compile issue --- src/operator/rnn.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a9b7c001477..5da870643cd0 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -192,14 +192,17 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, const std::vector &in_types) { const RNNParam& param = nnvm::get(attrs.parsed); OpStatePtr state = OpStatePtr(); - MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { state = OpStatePtr::Create>(param); - #else + return state; + }); + #else + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { state = OpStatePtr::Create>(param); - #endif return state; - }); + }); + #endif return OpStatePtr(); // should never reach here } From 5324d283db2b7f01ab97e83a68979ce31988a618 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 22:07:00 +0800 Subject: [PATCH 04/25] move cudnnrnn class into rnn-inl.h --- src/operator/cudnn_rnn-inl.h | 862 --------------------------------- src/operator/rnn-inl.h | 898 +++++++++++++++++++++++++++++++++++ src/operator/rnn.cc | 104 ---- src/operator/rnn.cu | 55 --- 4 files changed, 898 insertions(+), 1021 deletions(-) delete mode 100644 src/operator/cudnn_rnn-inl.h diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h deleted file mode 100644 index 425c67864e28..000000000000 --- a/src/operator/cudnn_rnn-inl.h +++ /dev/null @@ -1,862 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2016 by Contributors - * \file cudnn_rnn-inl.h - * \brief - * \author Sebastian Bodenstein -*/ -#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ - -#define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 -#include -#include -#include -#include -#include -#include -#include "./rnn-inl.h" - -namespace mxnet { -namespace op { -#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -template -class CuDNNRNNOp { - public: - RNNParam param_; - explicit CuDNNRNNOp(RNNParam param) { - this->param_ = param; - init_cudnn_ = false; - dtype_ = mshadow::DataType::kCudnnFlag; - // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. - // No tests in place for fp16 RNNs, so leave TensorCore disabled for now. - cudnn_tensor_core_ = false; - // When fp16 RNN tests are introduced, we can enable TensorCore as follows: -// cudnn_tensor_core = -// mshadow::DataType::kFlag == mshadow::kFloat16 && GetEnvAllowTensorCore(); - // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet - // RNN Mode - switch (param_.mode) { - case rnn_enum::kRnnRelu: - mode_ = CUDNN_RNN_RELU; - break; - case rnn_enum::kRnnTanh: - mode_ = CUDNN_RNN_TANH; - break; - case rnn_enum::kLstm: - mode_ = CUDNN_LSTM; - break; - case rnn_enum::kGru: - mode_ = CUDNN_GRU; - break; - default: - LOG(FATAL) << "Not implmented"; - } -#if USE_CUDNN_LSTM_PROJ - if (param_.projection_size.has_value()) { - CHECK_EQ(param_.mode, rnn_enum::kLstm) - << "Projection is only supported for LSTM."; - CHECK_GE(param_.state_size, param_.projection_size.value()) - << "State size must be larger than projection size."; - } -#else - CHECK(!param_.projection_size.has_value()) - << "Projection is only supported for LSTM with CuDNN version later than 7.1.1."; -#endif -#if USE_CUDNN_LSTM_PROJ - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - CHECK_EQ(param_.mode, rnn_enum::kLstm) - << "State clipping is only supported for LSTM."; - CHECK(param_.lstm_state_clip_min.has_value() && param_.lstm_state_clip_max.has_value()) - << "lstm_state_clip_min and lstm_state_clip_max must be specified together."; - CHECK_GE(param_.lstm_state_clip_max.value(), param_.lstm_state_clip_min.value()) - << "lstm_state_clip_max must be greater or equal to lstm_state_clip_min"; - } -#else - CHECK(!param_.lstm_state_clip_min.has_value() - && !param_.lstm_state_clip_max.has_value()) - << "State clipping is only supported for LSTM with CuDNN version later than 7.2.1."; -#endif - // RNN Direction - direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - // Create descriptors - CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); - CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); - - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&y_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); - #endif - } - - ~CuDNNRNNOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); - CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); - CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); - - if (init_cudnn_) { - for (size_t i = 0; i < x_desc_vec_.size(); ++i) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); - } - init_cudnn_ = false; - - Storage::Get()->Free(reserve_space_); - if (param_.p > 0) { - Storage::Get()->Free(dropout_states_); - } - } - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(y_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); - #endif - } - - void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - - void * hy_ptr = NULL; - if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; - - DType * cx_ptr = NULL; - DType * cy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm) - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - if (param_.mode == rnn_enum::kLstm && param_.state_outputs) - cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; - - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - - if (!init_cudnn_) { - Init(s, in_data, out_data); - } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); - #if USE_CUDNN_LSTM_PROJ - std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); - CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - seqLengthArray.data(), - nullptr)); - int out_size = - (param_.projection_size.has_value()) ? param_.projection_size.value() : param_.state_size; - out_size = (param_.bidirectional) ? (out_size * 2) : out_size; - CUDNN_CALL(cudnnSetRNNDataDescriptor(y_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - out_size, - seqLengthArray.data(), - nullptr)); - if (ctx.is_train) { - CUDNN_CALL(cudnnSetRNNDataDescriptor(dx_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - seqLengthArray.data(), - nullptr)); - CUDNN_CALL(cudnnSetRNNDataDescriptor(dy_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - out_size, - seqLengthArray.data(), - nullptr)); - } - #endif - - #if USE_CUDNN_LSTM_PROJ - bool clip_state = param_.lstm_state_clip_min.has_value(); - bool clip_nan = param_.lstm_state_clip_nan; - CUDNN_CALL(cudnnRNNSetClip(s->dnn_handle_, - rnn_desc_, - clip_state ? CUDNN_RNN_CLIP_MINMAX : CUDNN_RNN_CLIP_NONE, - clip_nan ? CUDNN_NOT_PROPAGATE_NAN : CUDNN_PROPAGATE_NAN, - clip_state ? param_.lstm_state_clip_min.value() : 0.0, - clip_state ? 
param_.lstm_state_clip_max.value() : 0.0)); - #endif - - if (ctx.is_train) { - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNForwardTrainingEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_data_desc_, - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - #else - CUDNN_CALL(cudnnRNNForwardTraining(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_desc_vec_.data(), - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - #endif - } else { - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNForwardInferenceEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_data_desc_, - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_)); - #else - CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_desc_vec_.data(), - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - temp_space.dptr_, - workspace_byte_)); - #endif - } - } - - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - CHECK_EQ(in_grad.size(), num_inputs); - CHECK_EQ(out_grad.size(), num_outputs); - CHECK_EQ(req.size(), num_inputs); - CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; - CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); - if (req[rnn_enum::kParams] != kAddTo) { - dw = mshadow::expr::ScalarExp(0.0f); - } - // only need kStateOut grad output_states is true - void * dhy_ptr = NULL; - if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; - - // Deal with lstm - void * dcx_ptr = NULL; - void * dcy_ptr = NULL; - void * cx_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; - } - if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(dw.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(dhx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(dy.CheckContiguous(), true); - - if (!init_cudnn_) { - Init(s, in_data, out_data); - } - - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, - rnn_desc_, - y_data_desc_, - y.dptr_, - dy_data_desc_, - dy.dptr_, - nullptr, - nullptr, - dhy_desc_, - dhy_ptr, - dcy_desc_, - dcy_ptr, - w_desc_, - w.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - dx_data_desc_, - dx.dptr_, - dhx_desc_, - dhx.dptr_, - dcx_desc_, - dcx_ptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - y_data_desc_, - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); - #else - CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - y_desc_vec_.data(), - y.dptr_, - dy_desc_vec_.data(), - dy.dptr_, - dhy_desc_, - dhy_ptr, - dcy_desc_, - dcy_ptr, - w_desc_, - w.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - dx_desc_vec_.data(), - dx.dptr_, - dhx_desc_, - dhx.dptr_, - dcx_desc_, - dcx_ptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - y_desc_vec_.data(), - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); - #endif - } - - private: - inline void 
Init(mshadow::Stream *s, - const std::vector &in_data, - const std::vector &out_data) { - using namespace mshadow; - #if CUDNN_MAJOR >= 5 - format_ = CUDNN_TENSOR_NCHW; - #endif - - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - if (!init_cudnn_) { - init_cudnn_ = true; - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - - // Tensor Descriptors - std::vector x_vec(param_.seq_length_); - std::vector y_vec(param_.seq_length_); - std::vector dx_vec(param_.seq_length_); - std::vector dy_vec(param_.seq_length_); - int dimA[3]; - int strideA[3]; - for (int i = 0; i < param_.seq_length_; i++) { - CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i])); - - dimA[0] = param_.batch_size_; - dimA[1] = param_.input_size_; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA)); - CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA)); - dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA)); - CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA)); - } - x_desc_vec_ = x_vec; - y_desc_vec_ = y_vec; - dx_desc_vec_ = dx_vec; - dy_desc_vec_ = dy_vec; - - // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - #if USE_CUDNN_LSTM_PROJ - int dimB[3]; - int strideB[3]; - dimB[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimB[1] = param_.batch_size_; - dimB[2] = param_.projection_size.has_value() ? 
- param_.projection_size.value() : param_.state_size; - strideB[0] = dimB[2] * dimB[1]; - strideB[1] = dimB[2]; - strideB[2] = 1; - #endif - - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_, - dtype_, - 3, - dimA, - strideA)); - - // Create Dropout descriptors - if (param_.p > 0) { - CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); - dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); - } else { - dropout_states_ = {}; - dropout_byte_ = 0; - } - CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, - param_.p, // discard probability - dropout_states_.dptr, dropout_byte_, - seed_)); - // RNN descriptors - #if CUDNN_MAJOR >= 6 - cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; - CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_, - rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - rnn_algo, - dtype_)); - #else - CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_)); - #endif - #if CUDNN_MAJOR >= 7 - cudnnMathType_t math_type = CUDNN_DEFAULT_MATH; - if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) { - math_type = CUDNN_TENSOR_OP_MATH; - } - #if CUDNN_VERSION >= 7200 - if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && - (DataType::kFlag != kFloat16)) - math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; - #endif - CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type)); - #endif - #if USE_CUDNN_LSTM_PROJ - if (param_.projection_size.has_value()) { - CUDNN_CALL(cudnnSetRNNProjectionLayers(s->dnn_handle_, - rnn_desc_, - param_.projection_size.value(), - 0)); - } - #endif - // Get temp space sizes - CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - &workspace_byte_)); - CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - &reserve_space_byte_)); - workspace_size_ = workspace_byte_ / sizeof(DType); - // Allocate the reserve space - reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); - - // Check that number of params are correct - size_t cudnn_param_size; - CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, - rnn_desc_, - x_desc_vec_[0], 
- &cudnn_param_size, - dtype_)); - CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); - - // Set param descriptors - int dim_w[3] = {1, 1, 1}; - dim_w[0] = w.shape_[0]; - CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_, - dtype_, - format_, - 3, - dim_w)); - CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_, - dtype_, - format_, - 3, - dim_w)); - - // Query weight layout - // cudnnFilterDescriptor_t m_desc; - // CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS); - // DType *p; - // int n = 2; - // int64_t last = 0; - // if (param_.mode == rnn_enum::kLstm) n = 8; - // else if (param_.mode == rnn_enum::kGru) n = 6; - - // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { - // for (int j = 0; j < n; ++j) { - // CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_, - // i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); - // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; - // last = ((int64_t)(p - NULL))/sizeof(DType); - // cudnnDataType_t t; - // cudnnTensorFormat_t f; - // int ndim = 5; - // int dims[5] = {0, 0, 0, 0, 0}; - // CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]), - // CUDNN_STATUS_SUCCESS); - // LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); - // for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i]; - // } - // } - - // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { - // for (int j = 0; j < n; ++j) { - // CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0], - // w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); - // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; - // last = ((int64_t)(p - NULL))/sizeof(DType); - // LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); - // } - // } - } - } - - cudnnDataType_t dtype_; - bool init_cudnn_; - cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t mode_; - cudnnDirectionMode_t direction_; - cudnnRNNInputMode_t input_mode_; - cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle dropout_states_, reserve_space_; - uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) - size_t workspace_byte_, reserve_space_byte_, dropout_byte_; - int workspace_size_, dropout_size_; - std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; - #if USE_CUDNN_LSTM_PROJ - cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; - #endif - cudnnTensorDescriptor_t hx_desc_, cx_desc_; - cudnnTensorDescriptor_t hy_desc_, cy_desc_; - cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - - cudnnFilterDescriptor_t w_desc_, dw_desc_; - // Allow TensorCore algo policy - bool cudnn_tensor_core_; - - #if CUDNN_MAJOR >= 5 - cudnnTensorFormat_t format_; - #endif -}; -#endif // CUDNN -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 0460c193de5b..e0b5de7b18f0 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -26,6 +26,9 @@ #ifndef MXNET_OPERATOR_RNN_INL_H_ #define MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_USE_CUDNN_RNN MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 +#define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 + #include #include #include @@ -35,6 +38,7 @@ #include #include #include +#include #include "./math.h" #include "./math_functions-inl.h" #include "./operator_common.h" @@ -375,6 
+379,826 @@ void RNNBackward(DType* ws, } } +#if MXNET_USE_CUDNN_RNN +template +class RNNOp { + public: + RNNParam param_; + explicit RNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. + // No tests in place for fp16 RNNs, so leave TensorCore disabled for now. + cudnn_tensor_core_ = false; + // When fp16 RNN tests are introduced, we can enable TensorCore as follows: +// cudnn_tensor_core = +// mshadow::DataType::kFlag == mshadow::kFloat16 && GetEnvAllowTensorCore(); + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implmented"; + } +#if USE_CUDNN_LSTM_PROJ + if (param_.projection_size.has_value()) { + CHECK_EQ(param_.mode, rnn_enum::kLstm) + << "Projection is only supported for LSTM."; + CHECK_GE(param_.state_size, param_.projection_size.value()) + << "State size must be larger than projection size."; + } +#else + CHECK(!param_.projection_size.has_value()) + << "Projection is only supported for LSTM with CuDNN version later than 7.1.1."; +#endif +#if USE_CUDNN_LSTM_PROJ + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + CHECK_EQ(param_.mode, rnn_enum::kLstm) + << "State clipping is only supported for LSTM."; + CHECK(param_.lstm_state_clip_min.has_value() && param_.lstm_state_clip_max.has_value()) + << "lstm_state_clip_min and lstm_state_clip_max must be specified together."; + CHECK_GE(param_.lstm_state_clip_max.value(), param_.lstm_state_clip_min.value()) + << "lstm_state_clip_max must be greater or equal to lstm_state_clip_min"; + } +#else + CHECK(!param_.lstm_state_clip_min.has_value() + && !param_.lstm_state_clip_max.has_value()) + << "State clipping is only supported for LSTM with CuDNN version later than 7.2.1."; +#endif + // RNN Direction + direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Create descriptors + CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); + CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); + + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&y_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); + #endif + } + + ~RNNOp() { + CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); + CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); + CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); + + if (init_cudnn_) { + for (size_t i = 0; i < x_desc_vec_.size(); ++i) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); + } + init_cudnn_ = false; + + Storage::Get()->Free(reserve_space_); + if (param_.p > 0) { + Storage::Get()->Free(dropout_states_); + } + } + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(y_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); + #endif + } + + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + + void * hy_ptr = NULL; + if (param_.state_outputs) + hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm) + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + if (param_.mode == rnn_enum::kLstm && param_.state_outputs) + cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + // Get temp space + int temp_size = workspace_size_; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + #if USE_CUDNN_LSTM_PROJ + std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); + CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + seqLengthArray.data(), + nullptr)); + int out_size = + (param_.projection_size.has_value()) ? param_.projection_size.value() : param_.state_size; + out_size = (param_.bidirectional) ? (out_size * 2) : out_size; + CUDNN_CALL(cudnnSetRNNDataDescriptor(y_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + out_size, + seqLengthArray.data(), + nullptr)); + if (ctx.is_train) { + CUDNN_CALL(cudnnSetRNNDataDescriptor(dx_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + seqLengthArray.data(), + nullptr)); + CUDNN_CALL(cudnnSetRNNDataDescriptor(dy_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + out_size, + seqLengthArray.data(), + nullptr)); + } + #endif + + #if USE_CUDNN_LSTM_PROJ + bool clip_state = param_.lstm_state_clip_min.has_value(); + bool clip_nan = param_.lstm_state_clip_nan; + CUDNN_CALL(cudnnRNNSetClip(s->dnn_handle_, + rnn_desc_, + clip_state ? CUDNN_RNN_CLIP_MINMAX : CUDNN_RNN_CLIP_NONE, + clip_nan ? CUDNN_NOT_PROPAGATE_NAN : CUDNN_PROPAGATE_NAN, + clip_state ? param_.lstm_state_clip_min.value() : 0.0, + clip_state ? 
param_.lstm_state_clip_max.value() : 0.0)); + #endif + + if (ctx.is_train) { + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNForwardTrainingEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_data_desc_, + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + #else + CUDNN_CALL(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + #endif + } else { + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNForwardInferenceEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_data_desc_, + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + temp_space.dptr_, + workspace_byte_)); + #else + CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_)); + #endif + } + } + + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); + CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; + CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + if (req[rnn_enum::kParams] != kAddTo) { + dw = mshadow::expr::ScalarExp(0.0f); + } + // only need kStateOut grad output_states is true + void * dhy_ptr = NULL; + if (param_.state_outputs) + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + + // Deal with lstm + void * dcx_ptr = NULL; + void * dcy_ptr = NULL; + void * cx_ptr = NULL; + + if (param_.mode == rnn_enum::kLstm) { + CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + } + if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(dw.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(dhx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(dy.CheckContiguous(), true); + + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, + rnn_desc_, + y_data_desc_, + y.dptr_, + dy_data_desc_, + dy.dptr_, + nullptr, + nullptr, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_data_desc_, + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + nullptr, + nullptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + y_data_desc_, + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + #else + CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + #endif + } + + private: + inline void 
Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + #if CUDNN_MAJOR >= 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + if (!init_cudnn_) { + init_cudnn_ = true; + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i])); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA)); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA)); + } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; + + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + #if USE_CUDNN_LSTM_PROJ + int dimB[3]; + int strideB[3]; + dimB[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); + dimB[1] = param_.batch_size_; + dimB[2] = param_.projection_size.has_value() ? 
+ param_.projection_size.value() : param_.state_size; + strideB[0] = dimB[2] * dimB[1]; + strideB[1] = dimB[2]; + strideB[2] = 1; + #endif + + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA)); + + // Create Dropout descriptors + if (param_.p > 0) { + CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); + dropout_size_ = dropout_byte_ / sizeof(DType); + dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); + } else { + dropout_states_ = {}; + dropout_byte_ = 0; + } + CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, + param_.p, // discard probability + dropout_states_.dptr, dropout_byte_, + seed_)); + // RNN descriptors + #if CUDNN_MAJOR >= 6 + cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; + CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_, + rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + rnn_algo, + dtype_)); + #else + CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_)); + #endif + #if CUDNN_MAJOR >= 7 + cudnnMathType_t math_type = CUDNN_DEFAULT_MATH; + if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) { + math_type = CUDNN_TENSOR_OP_MATH; + } + #if CUDNN_VERSION >= 7200 + if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && + (DataType::kFlag != kFloat16)) + math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + #endif + CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type)); + #endif + #if USE_CUDNN_LSTM_PROJ + if (param_.projection_size.has_value()) { + CUDNN_CALL(cudnnSetRNNProjectionLayers(s->dnn_handle_, + rnn_desc_, + param_.projection_size.value(), + 0)); + } + #endif + // Get temp space sizes + CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_)); + CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_)); + workspace_size_ = workspace_byte_ / sizeof(DType); + // Allocate the reserve space + reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); + + // Check that number of params are correct + size_t cudnn_param_size; + CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, + rnn_desc_, + x_desc_vec_[0], 
+ &cudnn_param_size, + dtype_)); + CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); + + // Set param descriptors + int dim_w[3] = {1, 1, 1}; + dim_w[0] = w.shape_[0]; + CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w)); + CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w)); + + // Query weight layout + // cudnnFilterDescriptor_t m_desc; + // CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS); + // DType *p; + // int n = 2; + // int64_t last = 0; + // if (param_.mode == rnn_enum::kLstm) n = 8; + // else if (param_.mode == rnn_enum::kGru) n = 6; + + // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { + // for (int j = 0; j < n; ++j) { + // CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_, + // i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); + // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; + // last = ((int64_t)(p - NULL))/sizeof(DType); + // cudnnDataType_t t; + // cudnnTensorFormat_t f; + // int ndim = 5; + // int dims[5] = {0, 0, 0, 0, 0}; + // CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]), + // CUDNN_STATUS_SUCCESS); + // LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); + // for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i]; + // } + // } + + // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { + // for (int j = 0; j < n; ++j) { + // CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0], + // w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); + // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; + // last = ((int64_t)(p - NULL))/sizeof(DType); + // LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); + // } + // } + } + } + + cudnnDataType_t dtype_; + bool init_cudnn_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + Storage::Handle dropout_states_, reserve_space_; + uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, dropout_size_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + #if USE_CUDNN_LSTM_PROJ + cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; + #endif + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; + + cudnnFilterDescriptor_t w_desc_, dw_desc_; + // Allow TensorCore algo policy + bool cudnn_tensor_core_; + + #if CUDNN_MAJOR >= 5 + cudnnTensorFormat_t format_; + #endif +}; +#else template class RNNOp { public: @@ -629,6 +1453,80 @@ class RNNOp { size_t reserve_space_size_; Storage::Handle reserve_space_; }; // class RNNOp +#endif + +static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, + const Context ctx, + const mxnet::ShapeVector &in_shapes, + const std::vector &in_types) { + const RNNParam& param = nnvm::get(attrs.parsed); + OpStatePtr state = OpStatePtr(); + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { + state = OpStatePtr::Create>(param); + return state; + }); + + return OpStatePtr(); +} + +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, 
+ const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); +} + +/* +index description +0: x +1: w +2: hx +3: y +4: dy +5: hy +6: dhy +7: cx +8: cy +9: dcy +*/ +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; + + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); + } + + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); + } + } + + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); +} } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 5da870643cd0..aa5c3f305975 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -24,9 +24,6 @@ * \author Sebastian Bodenstein */ #include "./rnn-inl.h" -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#include "./cudnn_rnn-inl.h" -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { @@ -141,27 +138,6 @@ static bool RNNType(const nnvm::NodeAttrs& attrs, return true; } -inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - DispatchMode wanted_mode = DispatchMode::kFCompute; - - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -inline static bool BackwardRNNStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - DispatchMode wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - struct RNNGrad { const char *op_name; std::vector operator()(const nnvm::NodePtr &n, @@ -186,84 +162,6 @@ struct RNNGrad { } }; -static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, - const Context ctx, - const mxnet::ShapeVector &in_shapes, - const std::vector &in_types) { - const RNNParam& param = nnvm::get(attrs.parsed); - OpStatePtr state = OpStatePtr(); - #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - state = OpStatePtr::Create>(param); - return state; - }); - #else - MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - state = OpStatePtr::Create>(param); - return state; - }); - #endif - return OpStatePtr(); // should never reach here -} - -template -void RNNStatefulCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - int dtype = inputs[rnn_enum::kData].type_flag_; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); - op.Forward(ctx, inputs, req, outputs); - }); -} -/* -index description -0: x -1: w -2: hx -3: y -4: 
dy -5: hy -6: dhy -7: cx -8: cy -9: dcy -*/ -template -void RNNStatefulGradCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector out_data{inputs[3]}; - std::vector out_grad{inputs[4]}; - const std::vector &in_grad = outputs; - - int dtype = inputs[rnn_enum::kData].type_flag_; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); - const RNNParam& param = op.param_; - int index = 5; - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - out_grad.push_back(inputs[index++]); - } - - if (param.mode == rnn_enum::kLstm) { - in_data.push_back(inputs[index++]); - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - out_grad.push_back(inputs[index]); - } - } - - op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); - }); -} - NNVM_REGISTER_OP(RNN) .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. @@ -342,7 +240,6 @@ The definition of GRU here is slightly different from paper but compatible with }) .set_attr("FInferShape", RNNShape) .set_attr("FInferType", RNNType) -.set_attr("FInferStorageType", RNNStorageType) .set_attr("FCreateOpState", CreateRNNState) .set_attr("FStatefulCompute", RNNStatefulCompute) .set_attr("FGradient", RNNGrad{"_backward_RNN"}) @@ -365,7 +262,6 @@ NNVM_REGISTER_OP(_backward_RNN) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BackwardRNNStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 91f38f91f67c..77bb95522711 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -26,65 +26,10 @@ #include "./rnn-inl.h" #include -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#include "./cudnn_rnn-inl.h" -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { -template -void RNNStatefulCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - int dtype = inputs[rnn_enum::kData].type_flag_; - #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - CuDNNRNNOp& op = state.get_state>(); - op.Forward(ctx, inputs, req, outputs); - }); - #else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; - #endif // MXNET_USE_CUDNN && CUDNN_MAJOR -} - -template -void RNNStatefulGradCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector out_data{inputs[3]}; - std::vector out_grad{inputs[4]}; - const std::vector &in_grad = outputs; - int dtype = inputs[rnn_enum::kData].type_flag_; - #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - CuDNNRNNOp& op = state.get_state>(); - const RNNParam& param = op.param_; - int index = 5; - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - out_grad.push_back(inputs[index++]); - } - - if (param.mode == rnn_enum::kLstm) { - in_data.push_back(inputs[index++]); - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - 
out_grad.push_back(inputs[index]); - } - } - op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); - }); - #else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; - #endif // MXNET_USE_CUDNN && CUDNN_MAJOR -} - NNVM_REGISTER_OP(RNN) .set_attr("FStatefulCompute", RNNStatefulCompute); From 31ec7d0a64a7972545e1820928db34d7698083c8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 23:19:15 +0800 Subject: [PATCH 05/25] fix some bugs --- src/operator/rnn-inl.h | 113 ++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 51 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index e0b5de7b18f0..e5386781498f 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -380,7 +380,7 @@ void RNNBackward(DType* ws, } #if MXNET_USE_CUDNN_RNN -template +template class RNNOp { public: RNNParam param_; @@ -505,6 +505,7 @@ class RNNOp { void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { +#if defined(__CUDACC__) using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; // kOut @@ -516,23 +517,23 @@ class RNNOp { CHECK_EQ(in_data.size(), num_inputs); CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); void * hy_ptr = NULL; if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm) - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; if (param_.mode == rnn_enum::kLstm && param_.state_outputs) - cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; + cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); @@ -544,8 +545,8 @@ class RNNOp { } // Get temp space int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -702,6 +703,7 @@ class RNNOp { workspace_byte_)); #endif } +#endif } void Backward(const OpContext &ctx, @@ -710,6 +712,7 @@ class RNNOp { const std::vector &out_data, const std::vector &req, const std::vector &in_grad) { +#if defined(__CUDACC__) using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 
4 : 3; // kOut @@ -726,23 +729,23 @@ class RNNOp { CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); if (req[rnn_enum::kParams] != kAddTo) { dw = mshadow::expr::ScalarExp(0.0f); } // only need kStateOut grad output_states is true void * dhy_ptr = NULL; if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; // Deal with lstm void * dcx_ptr = NULL; @@ -751,11 +754,11 @@ class RNNOp { if (param_.mode == rnn_enum::kLstm) { CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; } if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); @@ -771,8 +774,8 @@ class RNNOp { // Get temp space int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, @@ -863,13 +866,16 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif +#endif } private: - inline void Init(mshadow::Stream *s, + inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { +#if defined(__CUDACC__) using namespace mshadow; + #if CUDNN_MAJOR >= 5 format_ = CUDNN_TENSOR_NCHW; #endif @@ -887,8 +893,8 @@ class RNNOp { if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); param_.seq_length_ = x.shape_[0]; param_.batch_size_ = x.shape_[1]; param_.input_size_ = x.shape_[2]; @@ -1168,6 +1174,7 @@ class RNNOp { // } // } } +#endif } cudnnDataType_t dtype_; @@ -1199,7 +1206,7 @@ class RNNOp { #endif }; #else -template +template class RNNOp { public: explicit RNNOp(RNNParam p) @@ -1238,12 +1245,12 @@ class RNNOp { } CHECK_EQ(in_data.size(), num_inputs); CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); + Stream *s = 
ctx.get_stream(); // get input + output tensor - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); CHECK(x.CheckContiguous()); CHECK(w.CheckContiguous()); CHECK(hx.CheckContiguous()); @@ -1272,8 +1279,8 @@ class RNNOp { // allocate temp space const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size), s); if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -1356,16 +1363,16 @@ class RNNOp { CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - mshadow::Stream *s = ctx.get_stream(); + mshadow::Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); CHECK(x.CheckContiguous()); CHECK(w.CheckContiguous()); CHECK(hx.CheckContiguous()); @@ -1404,8 +1411,8 @@ class RNNOp { // allocate temp space const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size), s); size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -1462,7 +1469,11 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, const RNNParam& param = nnvm::get(attrs.parsed); OpStatePtr state = OpStatePtr(); MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - state = OpStatePtr::Create>(param); + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(param); + } else { + state = OpStatePtr::Create>(param); + } return state; }); @@ -1477,7 +1488,7 @@ void RNNStatefulCompute(const OpStatePtr& state, const std::vector& outputs) { int dtype = inputs[rnn_enum::kData].type_flag_; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); + RNNOp& op = state.get_state>(); op.Forward(ctx, inputs, req, outputs); }); } @@ -1508,7 +1519,7 @@ void RNNStatefulGradCompute(const OpStatePtr& state, int dtype = 
inputs[rnn_enum::kData].type_flag_; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); + RNNOp& op = state.get_state>(); const RNNParam& param = op.param_; int index = 5; if (param.state_outputs) { From 27d11e7a3611ba2aa528d04d30bf90dae649a15c Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Thu, 21 Mar 2019 21:25:22 +0800 Subject: [PATCH 06/25] fix bug about gpu case like tests/python/gpu/test_gluon_gpu.test_rnn_layers_fp32 etc --- src/operator/rnn-inl.h | 499 +++++++++++++++++------------------------ 1 file changed, 206 insertions(+), 293 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index e5386781498f..30fe746ce528 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -379,13 +379,15 @@ void RNNBackward(DType* ws, } } -#if MXNET_USE_CUDNN_RNN template class RNNOp { public: RNNParam param_; - explicit RNNOp(RNNParam param) { + Context ctx_; + explicit RNNOp(RNNParam param, Context ctx) { this->param_ = param; + this->ctx_ = ctx; + #if MXNET_USE_CUDNN_RNN init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. @@ -463,9 +465,23 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif + #endif + if (ctx_.dev_type == kCPU) { + this->init_space_ = false; + this->reserve_space_size_ = 0; + if (param_.projection_size.has_value()) { + LOG(FATAL) << + "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; + } + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; + } + } } ~RNNOp() { + #if MXNET_USE_CUDNN_RNN CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); @@ -500,12 +516,22 @@ class RNNOp { CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); #endif + #endif + if (ctx_.dev_type == kCPU) { + if (init_space_) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + } } void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { -#if defined(__CUDACC__) + using namespace mshadow::expr; + CHECK(param_.p >= 0.0f && param_.p < 1.0f) + << "unsupported dropout value, should be 0 <= dropout < 1"; + using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; // kOut @@ -524,10 +550,17 @@ class RNNOp { Tensor hx = in_data[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); - void * hy_ptr = NULL; - if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + const int direction = param_.bidirectional ? 
2 : 1; + const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; + DType* hy_ptr = NULL; + if (param_.state_outputs) { + hy_ptr = out_data[rnn_enum::kStateOut].dptr(); + } DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm) @@ -540,6 +573,12 @@ class RNNOp { CHECK_EQ(hx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); + // allocate temp space + const size_t workspace_size_cpu = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + + DType* workspace_cpu = NULL; + #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -547,7 +586,10 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); + mshadow::Shape1(temp_size + workspace_size_cpu), s); + + workspace_cpu = temp_space.dptr_ + temp_size; + #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, @@ -703,7 +745,69 @@ class RNNOp { workspace_byte_)); #endif } -#endif + #endif + + if (ctx_.dev_type == kCPU) { + if (!workspace_cpu) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size_cpu), s); + workspace_cpu = workspace.dptr_; + } + if (ctx.is_train) { + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (init_space_ && reserve_space_size_ < r_size) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + if (!init_space_) { + reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); + reserve_space_size_ = r_size; + init_space_ = true; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + + RNNForwardTraining(workspace_cpu, + reserve_space_ptr, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.p, + param_.mode); + } else { + RNNForwardInference(workspace_cpu, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.mode); + } + } } void Backward(const OpContext &ctx, @@ -712,8 +816,11 @@ class RNNOp { const std::vector &out_data, const std::vector &req, const std::vector &in_grad) { -#if defined(__CUDACC__) using namespace mshadow; + using namespace mshadow::expr; + CHECK(param_.p >= 0.0f && param_.p < 1.0f) + << "unsupported dropout value, should be 0 <= dropout < 1"; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 
4 : 3; // kOut size_t num_outputs = 1; @@ -739,18 +846,36 @@ class RNNOp { Tensor dhx = in_grad[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor dy = out_grad[rnn_enum::kOut].get(s); + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(dw.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(dhx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(dy.CheckContiguous(), true); + if (req[rnn_enum::kParams] != kAddTo) { dw = mshadow::expr::ScalarExp(0.0f); } - // only need kStateOut grad output_states is true - void * dhy_ptr = NULL; - if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; - // Deal with lstm - void * dcx_ptr = NULL; - void * dcy_ptr = NULL; - void * cx_ptr = NULL; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + + const int direction = param_.bidirectional ? 2 : 1; + const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + + DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; + + DType * dhy_ptr = NULL; + if (param_.state_outputs) { + dhy_ptr = out_grad[rnn_enum::kStateOut].dptr(); + } + + DType* dcx_ptr = NULL; + DType* dcy_ptr = NULL; + DType* cx_ptr = NULL; if (param_.mode == rnn_enum::kLstm) { CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; @@ -760,14 +885,12 @@ class RNNOp { if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(dw.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(dhx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(dy.CheckContiguous(), true); - + // allocate temp space + const size_t workspace_size_cpu = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + DType* workspace_cpu = NULL; + #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -776,7 +899,8 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); + mshadow::Shape1(temp_size + workspace_size_cpu), s); + workspace_cpu = temp_space.dptr_ + temp_size; #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -866,14 +990,60 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif -#endif + #endif + + if (ctx_.dev_type == kCPU) { + if (!workspace_cpu) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size_cpu), s); + workspace_cpu = workspace.dptr_; + } + size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + + if (!init_space_ || reserve_space_size_ != r_size) { + LOG(FATAL) << "Check forward init error"; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + RNNBackward(workspace_cpu, + reserve_space_ptr, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + y.dptr_, + dy.dptr_, + dhy_ptr, + dcy_ptr, + dx.dptr_, + dhx.dptr_, + dcx_ptr, + dw.dptr_, + db_ptr, + req[rnn_enum::kData], + 
req[rnn_enum::kParams], + req[rnn_enum::kState], + // State cell should be present for LSTMs, but is absent for other RNNs. + param_.mode == rnn_enum::kLstm ? req[rnn_enum::kStateCell] : kNullOp, + param_.p, + param_.mode); + } } + private: inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { -#if defined(__CUDACC__) + #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) using namespace mshadow; #if CUDNN_MAJOR >= 5 @@ -970,7 +1140,6 @@ class RNNOp { strideB[1] = dimB[2]; strideB[2] = 1; #endif - #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, @@ -1114,7 +1283,6 @@ class RNNOp { workspace_size_ = workspace_byte_ / sizeof(DType); // Allocate the reserve space reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); - // Check that number of params are correct size_t cudnn_param_size; CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, @@ -1123,7 +1291,6 @@ class RNNOp { &cudnn_param_size, dtype_)); CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); - // Set param descriptors int dim_w[3] = {1, 1, 1}; dim_w[0] = w.shape_[0]; @@ -1174,9 +1341,9 @@ class RNNOp { // } // } } -#endif + #endif } - + #if MXNET_USE_CUDNN_RNN cudnnDataType_t dtype_; bool init_cudnn_; cudnnRNNDescriptor_t rnn_desc_; @@ -1204,263 +1371,11 @@ class RNNOp { #if CUDNN_MAJOR >= 5 cudnnTensorFormat_t format_; #endif -}; -#else -template -class RNNOp { - public: - explicit RNNOp(RNNParam p) - :param_(p), init_space_(false), reserve_space_size_(0) { - if (param_.projection_size.has_value()) { - LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; - } - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; - } - } - - ~RNNOp() { - if (init_space_) { - Storage::Get()->Free(reserve_space_); - init_space_ = false; - } - } - - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK(param_.p >= 0.0f && param_.p < 1.0f) - << "unsupported dropout value, should be 0 <= dropout < 1"; - - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - } - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); - // get input + output tensor - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - CHECK(x.CheckContiguous()); - CHECK(w.CheckContiguous()); - CHECK(hx.CheckContiguous()); - CHECK(y.CheckContiguous()); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - const int direction = param_.bidirectional ? 
2 : 1; - const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); - DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; - - DType* hy_ptr = NULL; - if (param_.state_outputs) { - hy_ptr = out_data[rnn_enum::kStateOut].dptr(); - } - DType* cx_ptr = NULL; - DType* cy_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - cx_ptr = in_data[rnn_enum::kStateCell].dptr(); - if (param_.state_outputs) { - cy_ptr = out_data[rnn_enum::kStateCellOut].dptr(); - } - } - - // allocate temp space - const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); - if (ctx.is_train) { - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (init_space_ && reserve_space_size_ < r_size) { - Storage::Get()->Free(reserve_space_); - init_space_ = false; - } - if (!init_space_) { - reserve_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_space_size_ = r_size; - init_space_ = true; - } - - DType* reserve_space_ptr = static_cast(reserve_space_.dptr); - - RNNForwardTraining(workspace.dptr_, - reserve_space_ptr, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.p, - param_.mode); - } else { - RNNForwardInference(workspace.dptr_, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.mode); - } - } - - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK(param_.p >= 0.0f && param_.p < 1.0f) - << "unsupported dropout value, should be 0 <= dropout < 1"; - - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - CHECK_EQ(in_grad.size(), num_inputs); - CHECK_EQ(out_grad.size(), num_outputs); - CHECK_EQ(req.size(), num_inputs); - CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; - CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - mshadow::Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); - CHECK(x.CheckContiguous()); - CHECK(w.CheckContiguous()); - CHECK(hx.CheckContiguous()); - CHECK(y.CheckContiguous()); - CHECK(dx.CheckContiguous()); - CHECK(dw.CheckContiguous()); - CHECK(dhx.CheckContiguous()); - CHECK(dy.CheckContiguous()); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - - const int direction = param_.bidirectional ? 2 : 1; - const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); - - DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; - - DType * dhy_ptr = NULL; - if (param_.state_outputs) { - dhy_ptr = out_grad[rnn_enum::kStateOut].dptr(); - } - - DType * cx_ptr = NULL; - DType * dcx_ptr = NULL; - DType * dcy_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = in_data[rnn_enum::kStateCell].dptr(); - dcx_ptr = in_grad[rnn_enum::kStateCell].dptr(); - if (param_.state_outputs) { - dcy_ptr = out_grad[rnn_enum::kStateCellOut].dptr(); - } - } - - // allocate temp space - const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); - - size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - - if (!init_space_ || reserve_space_size_ != r_size) { - LOG(FATAL) << "Check forward init error"; - } - - DType* reserve_space_ptr = static_cast(reserve_space_.dptr); - RNNBackward(workspace.dptr_, - reserve_space_ptr, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - y.dptr_, - dy.dptr_, - dhy_ptr, - dcy_ptr, - dx.dptr_, - dhx.dptr_, - dcx_ptr, - dw.dptr_, - db_ptr, - req[rnn_enum::kData], - req[rnn_enum::kParams], - req[rnn_enum::kState], - // State cell should be present for LSTMs, but is absent for other RNNs. - param_.mode == rnn_enum::kLstm ? 
req[rnn_enum::kStateCell] : kNullOp, - param_.p, - param_.mode); - } - - RNNParam param_; - - private: + #endif bool init_space_; size_t reserve_space_size_; - Storage::Handle reserve_space_; -}; // class RNNOp -#endif + Storage::Handle reserve_cpu_space_; +}; // class RNNOp static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, const Context ctx, @@ -1470,14 +1385,12 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, OpStatePtr state = OpStatePtr(); MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { if (ctx.dev_type == kGPU) { - state = OpStatePtr::Create>(param); + state = OpStatePtr::Create>(param, ctx); } else { - state = OpStatePtr::Create>(param); + state = OpStatePtr::Create>(param, ctx); } - return state; }); - - return OpStatePtr(); + return state; } template From a523b77e3232256604f5e6316e25731d8e4a1fd9 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 10:10:19 +0800 Subject: [PATCH 07/25] fix gpu compile issue of unix-gpu and windows-gpu --- src/operator/rnn-inl.h | 63 +++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 30fe746ce528..feffd8c26ca2 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -468,7 +468,7 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { this->init_space_ = false; - this->reserve_space_size_ = 0; + this->reserve_cpu_space_size_ = 0; if (param_.projection_size.has_value()) { LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; @@ -528,11 +528,10 @@ class RNNOp { void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - - using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 
4 : 3; // kOut size_t num_outputs = 1; @@ -574,11 +573,11 @@ class RNNOp { CHECK_EQ(y.CheckContiguous(), true); // allocate temp space - const size_t workspace_size_cpu = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); + DType* work_cpu_space = NULL; - DType* workspace_cpu = NULL; - #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -586,9 +585,9 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + workspace_size_cpu), s); + mshadow::Shape1(temp_size + work_cpu_space_size), s); - workspace_cpu = temp_space.dptr_ + temp_size; + work_cpu_space = temp_space.dptr_ + temp_size; #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -748,28 +747,28 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!workspace_cpu) { + if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size_cpu), s); - workspace_cpu = workspace.dptr_; + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; } if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); - if (init_space_ && reserve_space_size_ < r_size) { + if (init_space_ && reserve_cpu_space_size_ < r_size) { Storage::Get()->Free(reserve_cpu_space_); init_space_ = false; } if (!init_space_) { reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_space_size_ = r_size; + reserve_cpu_space_size_ = r_size; init_space_ = true; } DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - RNNForwardTraining(workspace_cpu, + RNNForwardTraining(work_cpu_space, reserve_space_ptr, param_.state_outputs, param_.num_layers, @@ -789,7 +788,7 @@ class RNNOp { param_.p, param_.mode); } else { - RNNForwardInference(workspace_cpu, + RNNForwardInference(work_cpu_space, param_.state_outputs, param_.num_layers, direction, @@ -886,11 +885,11 @@ class RNNOp { dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; // allocate temp space - const size_t workspace_size_cpu = + const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - DType* workspace_cpu = NULL; - #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) + DType* work_cpu_space = NULL; + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -899,8 +898,8 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + workspace_size_cpu), s); - workspace_cpu = temp_space.dptr_ + temp_size; + mshadow::Shape1(temp_size + work_cpu_space_size), s); + work_cpu_space = temp_space.dptr_ + temp_size; #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -993,21 +992,21 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!workspace_cpu) { + if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size_cpu), s); - workspace_cpu = 
workspace.dptr_; + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; } size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); - if (!init_space_ || reserve_space_size_ != r_size) { + if (!init_space_ || reserve_cpu_space_size_ != r_size) { LOG(FATAL) << "Check forward init error"; } DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - RNNBackward(workspace_cpu, + RNNBackward(work_cpu_space, reserve_space_ptr, param_.num_layers, direction, @@ -1043,13 +1042,7 @@ class RNNOp { inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { - #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) using namespace mshadow; - - #if CUDNN_MAJOR >= 5 - format_ = CUDNN_TENSOR_NCHW; - #endif - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; // kOut size_t num_outputs = 1; @@ -1060,6 +1053,12 @@ class RNNOp { CHECK_EQ(in_data.size(), num_inputs); CHECK_EQ(out_data.size(), num_outputs); + + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) + #if CUDNN_MAJOR >= 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors @@ -1373,7 +1372,7 @@ class RNNOp { #endif #endif bool init_space_; - size_t reserve_space_size_; + size_t reserve_cpu_space_size_; Storage::Handle reserve_cpu_space_; }; // class RNNOp From 148d110c7ac385c67197ee944711d4167c38c0f4 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 13:02:06 +0800 Subject: [PATCH 08/25] print log for test --- src/operator/rnn-inl.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index feffd8c26ca2..409e511f0434 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -576,6 +576,26 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); DType* work_cpu_space = NULL; + #if MXNET_USE_CUDNN_RNN + LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; + #else + LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; + #endif + #if defined(__CUDACC__) + LOG(INFO) << "defined(__CUDACC__):true"; + #else + LOG(INFO) << "defined(__CUDACC__):false"; + #endif + #if MXNET_USE_CUDNN == 1 + LOG(INFO) << "MXNET_USE_CUDNN == 1:true"; + #else + LOG(INFO) << "MXNET_USE_CUDNN == 1:false"; + #endif + #if CUDNN_VERSION >= 7200 + LOG(INFO) << "CUDNN_VERSION >= 7200:true"; + #else + LOG(INFO) << "CUDNN_VERSION >= 7200:false"; + #endif #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { From 7b97de4e4592138126207be4ee093bbb097e8bec Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 14:24:25 +0800 Subject: [PATCH 09/25] fix GPU NO CUDNN for unix-gpu case --- src/operator/rnn-inl.h | 163 +++++++++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 21 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 409e511f0434..60a8af3f4d20 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -466,6 +466,24 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif #endif + + #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + this->init_space_ = false; + this->reserve_cpu_space_size_ = 0; + if (param_.projection_size.has_value()) { + LOG(FATAL) << + "hidden layer projection is only supported for GPU with CuDNN later than 
7.1.1"; + } + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; + } + } + #endif + + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { this->init_space_ = false; this->reserve_cpu_space_size_ = 0; @@ -517,6 +535,18 @@ class RNNOp { CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); #endif #endif + + #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + if (init_space_) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + } + #endif + + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (init_space_) { Storage::Get()->Free(reserve_cpu_space_); @@ -576,27 +606,6 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); DType* work_cpu_space = NULL; - #if MXNET_USE_CUDNN_RNN - LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; - #else - LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; - #endif - #if defined(__CUDACC__) - LOG(INFO) << "defined(__CUDACC__):true"; - #else - LOG(INFO) << "defined(__CUDACC__):false"; - #endif - #if MXNET_USE_CUDNN == 1 - LOG(INFO) << "MXNET_USE_CUDNN == 1:true"; - #else - LOG(INFO) << "MXNET_USE_CUDNN == 1:false"; - #endif - #if CUDNN_VERSION >= 7200 - LOG(INFO) << "CUDNN_VERSION >= 7200:true"; - #else - LOG(INFO) << "CUDNN_VERSION >= 7200:false"; - #endif - #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(s, in_data, out_data); @@ -764,8 +773,72 @@ class RNNOp { workspace_byte_)); #endif } + #else + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + if (!work_cpu_space) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; + } + if (ctx.is_train) { + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (init_space_ && reserve_cpu_space_size_ < r_size) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + if (!init_space_) { + reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); + reserve_cpu_space_size_ = r_size; + init_space_ = true; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + + RNNForwardTraining(work_cpu_space, + reserve_space_ptr, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.p, + param_.mode); + } else { + RNNForwardInference(work_cpu_space, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.mode); + } + } #endif + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] @@ -1009,8 +1082,56 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif + + #else + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + if (!work_cpu_space) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; + } + 
size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + + if (!init_space_ || reserve_cpu_space_size_ != r_size) { + LOG(FATAL) << "Check forward init error"; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + RNNBackward(work_cpu_space, + reserve_space_ptr, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + y.dptr_, + dy.dptr_, + dhy_ptr, + dcy_ptr, + dx.dptr_, + dhx.dptr_, + dcx_ptr, + dw.dptr_, + db_ptr, + req[rnn_enum::kData], + req[rnn_enum::kParams], + req[rnn_enum::kState], + // State cell should be present for LSTMs, but is absent for other RNNs. + param_.mode == rnn_enum::kLstm ? req[rnn_enum::kStateCell] : kNullOp, + param_.p, + param_.mode); + } #endif + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] From 58c7f86ddb9ae99a58980e3b1ea4c85938d4e8d5 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 15:47:00 +0800 Subject: [PATCH 10/25] print log --- src/operator/rnn-inl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 60a8af3f4d20..9b4f0b15028c 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -467,9 +467,15 @@ class RNNOp { #endif #endif + #if MXNET_USE_CUDNN_RNN + LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; + #else + LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; + #endif #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) // GPU NO CUDNN if (ctx_.dev_type == kGPU) { + LOG(INFO) << "ctx_.dev_type == kGPU:true"; this->init_space_ = false; this->reserve_cpu_space_size_ = 0; if (param_.projection_size.has_value()) { @@ -777,6 +783,7 @@ class RNNOp { // GPU NO CUDNN if (ctx_.dev_type == kGPU) { if (!work_cpu_space) { + LOG(INFO) << "work_cpu_space is null, allocate"; Tensor workspace = ctx.requested[rnn_enum::kTempSpace] .get_space_typed(Shape1(work_cpu_space_size), s); work_cpu_space = workspace.dptr_; From 2edb811887e17401b72027a576e69f1b84517408 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 22:57:10 +0800 Subject: [PATCH 11/25] remove print log and make gpu case has same error message as master when USE_CUDA=1 USE_CUDNN=0 --- src/operator/rnn-inl.h | 147 +---------------------------------------- 1 file changed, 1 insertion(+), 146 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 9b4f0b15028c..56003d3fca08 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -465,31 +465,10 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif - #endif - - #if MXNET_USE_CUDNN_RNN - LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; #else - LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; - #endif - #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - LOG(INFO) << "ctx_.dev_type == kGPU:true"; - this->init_space_ = false; - this->reserve_cpu_space_size_ = 0; - if (param_.projection_size.has_value()) { - LOG(FATAL) << - "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; - } - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; - } - } + LOG(FATAL) << "RNN on 
GPU is only available for cuDNN at the moment."; #endif - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { this->init_space_ = false; this->reserve_cpu_space_size_ = 0; @@ -542,17 +521,6 @@ class RNNOp { #endif #endif - #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - if (init_space_) { - Storage::Get()->Free(reserve_cpu_space_); - init_space_ = false; - } - } - #endif - - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (init_space_) { Storage::Get()->Free(reserve_cpu_space_); @@ -779,73 +747,8 @@ class RNNOp { workspace_byte_)); #endif } - #else - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - if (!work_cpu_space) { - LOG(INFO) << "work_cpu_space is null, allocate"; - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } - if (ctx.is_train) { - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (init_space_ && reserve_cpu_space_size_ < r_size) { - Storage::Get()->Free(reserve_cpu_space_); - init_space_ = false; - } - if (!init_space_) { - reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_cpu_space_size_ = r_size; - init_space_ = true; - } - - DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - - RNNForwardTraining(work_cpu_space, - reserve_space_ptr, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.p, - param_.mode); - } else { - RNNForwardInference(work_cpu_space, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.mode); - } - } #endif - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] @@ -1089,56 +992,8 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif - - #else - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - if (!work_cpu_space) { - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } - size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - - if (!init_space_ || reserve_cpu_space_size_ != r_size) { - LOG(FATAL) << "Check forward init error"; - } - - DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - RNNBackward(work_cpu_space, - reserve_space_ptr, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - y.dptr_, - dy.dptr_, - dhy_ptr, - dcy_ptr, - dx.dptr_, - dhx.dptr_, - dcx_ptr, - dw.dptr_, - db_ptr, - req[rnn_enum::kData], - req[rnn_enum::kParams], - req[rnn_enum::kState], - // State cell should be present for LSTMs, but is absent for other RNNs. - param_.mode == rnn_enum::kLstm ? 
req[rnn_enum::kStateCell] : kNullOp, - param_.p, - param_.mode); - } #endif - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] From b94c45215d2a366c867a3d205fa522c42c446ee5 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 23:55:13 +0800 Subject: [PATCH 12/25] fix typo bug --- src/operator/rnn-inl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 56003d3fca08..db90fe8b74de 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -466,7 +466,9 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif #else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + if (ctx_.dev_type == kGPU) { + LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + } #endif if (ctx_.dev_type == kCPU) { From bb6e9f1d2f7ff341b2935a794122b4e6812e13b9 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 25 Mar 2019 08:57:26 +0800 Subject: [PATCH 13/25] retrigger the ci From adc92d354826dfdb9d1030461de5ccd5be72edec Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 25 Mar 2019 14:08:51 +0800 Subject: [PATCH 14/25] retrigger the ci From 67b3909f4044760fdeff3bd4af896c1d2da904a8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Tue, 26 Mar 2019 13:45:19 +0800 Subject: [PATCH 15/25] retrigger the ci From 3987b9c5a0abfc568e37683f5202956bf6eb56d3 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Tue, 26 Mar 2019 22:21:15 +0800 Subject: [PATCH 16/25] fix comments --- src/operator/rnn-inl.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index db90fe8b74de..30055ff16e9a 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -511,9 +511,6 @@ class RNNOp { init_cudnn_ = false; Storage::Get()->Free(reserve_space_); - if (param_.p > 0) { - Storage::Get()->Free(dropout_states_); - } } #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); @@ -584,7 +581,7 @@ class RNNOp { DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { - Init(s, in_data, out_data); + Init(ctx, s, in_data, out_data); } // Get temp space int temp_size = workspace_size_; @@ -896,7 +893,7 @@ class RNNOp { DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { - Init(s, in_data, out_data); + Init(ctx, s, in_data, out_data); } // Get temp space @@ -1044,7 +1041,8 @@ class RNNOp { private: - inline void Init(mshadow::Stream *s, + inline void Init(const OpContext &ctx, + mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { using namespace mshadow; @@ -1218,18 +1216,21 @@ class RNNOp { strideA)); // Create Dropout descriptors + DType* dropout_states_ = NULL; if (param_.p > 0) { CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); + dropout_states_ = ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(dropout_size_), s).dptr_; } else { - dropout_states_ = {}; dropout_byte_ = 0; } + CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, param_.p, // discard probability - dropout_states_.dptr, dropout_byte_, + dropout_states_, dropout_byte_, seed_)); + // RNN descriptors #if CUDNN_MAJOR >= 6 
cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; @@ -1355,7 +1356,7 @@ class RNNOp { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle dropout_states_, reserve_space_; + Storage::Handle reserve_space_; uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) size_t workspace_byte_, reserve_space_byte_, dropout_byte_; int workspace_size_, dropout_size_; From d6b3fe5ccea013056cc5a3f2e4a3ff78f0974d74 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 27 Mar 2019 08:50:39 +0800 Subject: [PATCH 17/25] retrigger the ci From 9b7c3fcca3909169827e08e54cfab4e4f3a1ea55 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 10:10:58 +0800 Subject: [PATCH 18/25] fix comments --- src/operator/rnn-inl.h | 44 ++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 30055ff16e9a..c1fd005ac90d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -563,8 +563,8 @@ class RNNOp { if (param_.state_outputs) { hy_ptr = out_data[rnn_enum::kStateOut].dptr(); } - DType * cx_ptr = NULL; - DType * cy_ptr = NULL; + DType* cx_ptr = NULL; + DType* cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm) cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; if (param_.mode == rnn_enum::kLstm && param_.state_outputs) @@ -575,10 +575,6 @@ class RNNOp { CHECK_EQ(hx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); - // allocate temp space - const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(ctx, s, in_data, out_data); @@ -587,9 +583,7 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + work_cpu_space_size), s); - - work_cpu_space = temp_space.dptr_ + temp_size; + mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -749,11 +743,13 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!work_cpu_space) { - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } + // allocate temp space + const size_t work_cpu_space_size = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(work_cpu_space_size), s); + DType* work_cpu_space = workspace.dptr_; if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -855,6 +851,7 @@ class RNNOp { CHECK_EQ(dhx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); CHECK_EQ(dy.CheckContiguous(), true); + CHECK_EQ(dx.CheckContiguous(), true); if (req[rnn_enum::kParams] != kAddTo) { dw = mshadow::expr::ScalarExp(0.0f); @@ -886,11 +883,6 @@ class RNNOp { if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - // allocate temp space - const size_t work_cpu_space_size = - GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && 
defined(__CUDACC__) if (!init_cudnn_) { Init(ctx, s, in_data, out_data); @@ -900,8 +892,7 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + work_cpu_space_size), s); - work_cpu_space = temp_space.dptr_ + temp_size; + mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -994,11 +985,14 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!work_cpu_space) { - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + // allocate temp space + const size_t work_cpu_space_size = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + DType* work_cpu_space = NULL; + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } + work_cpu_space = workspace.dptr_; size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); From 27a6840cd159288dfdb211a6b5fce5314b2a4e69 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 12:09:07 +0800 Subject: [PATCH 19/25] retrigger the ci From 0aec50e9127ebe44d0ba5f774e26a8f0ccbeb2c8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 15:40:03 +0800 Subject: [PATCH 20/25] fix comments --- src/operator/rnn-inl.h | 10 ++++------ src/operator/rnn.cc | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index c1fd005ac90d..f3d7a4a4a4ff 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -51,7 +51,7 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kState, kStateCell}; enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; - enum RNNOpResource {kTempSpace}; + enum RNNOpResource {kTempSpace, kCuDNNDropoutDescSpace}; } inline int GetRnnParamSize(int num_layer, @@ -1212,10 +1212,8 @@ class RNNOp { // Create Dropout descriptors DType* dropout_states_ = NULL; if (param_.p > 0) { - CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); - dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(dropout_size_), s).dptr_; + ctx.requested[rnn_enum::kCuDNNDropoutDescSpace].get_cudnn_dropout_desc + (&dropout_desc_, s, 1.0f - param_.p, seed_); } else { dropout_byte_ = 0; } @@ -1353,7 +1351,7 @@ class RNNOp { Storage::Handle reserve_space_; uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) size_t workspace_byte_, reserve_space_byte_, dropout_byte_; - int workspace_size_, dropout_size_; + int workspace_size_; std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; #if USE_CUDNN_LSTM_PROJ cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index aa5c3f305975..70ee3cd216e0 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -243,8 +243,21 @@ The definition of GRU here is slightly different from paper but compatible with .set_attr("FCreateOpState", CreateRNNState) .set_attr("FStatefulCompute", RNNStatefulCompute) .set_attr("FGradient", RNNGrad{"_backward_RNN"}) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; +.set_attr("FResourceRequestEx", + 
[](const NodeAttrs& attrs, const int dev_mask, const DispatchMode dispatch_mode) { + std::vector request; + request.emplace_back(ResourceRequest::kTempSpace); + const RNNParam& param = nnvm::get(attrs.parsed); + if (param.p == 0) return request; + if (dev_mask == kGPU) { +#if MXNET_USE_CUDNN_RNN + if (1.0f - param.p > 0) { + request.emplace_back(ResourceRequest::kCuDNNDropoutDesc); + return request; + } +#endif + } + return request; }) .add_argument("data", "NDArray-or-Symbol", "Input data to RNN") .add_argument("parameters", "NDArray-or-Symbol", From 69f7f7852944ae181c450e7bd30e30aa6f21bf00 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 20:06:07 +0800 Subject: [PATCH 21/25] retrigger the ci From c950d1f4b7e864fc744e43bdb8b8d859737568f3 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Tue, 9 Apr 2019 06:41:14 +0800 Subject: [PATCH 22/25] retrigger the ci From 5c5a9a446dd8f3110df8e5bf5587cda425eda3b4 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 10 Apr 2019 10:54:12 +0800 Subject: [PATCH 23/25] retrigger the ci From d99d82aa863877958502e391f95e80cfe0043bcc Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 10 Apr 2019 20:01:48 +0800 Subject: [PATCH 24/25] retrigger the ci From 69295caad333a6e79d84a48410a0893c10786ab8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Thu, 11 Apr 2019 12:26:26 +0800 Subject: [PATCH 25/25] fix comments --- src/operator/rnn-inl.h | 69 +++++++++++++++++++++++------------------- src/operator/rnn.cc | 4 --- 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index f3d7a4a4a4ff..37f21ce6d126 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -51,7 +51,7 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kState, kStateCell}; enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; - enum RNNOpResource {kTempSpace, kCuDNNDropoutDescSpace}; + enum RNNOpResource {kCuDNNDropoutDescSpace}; } inline int GetRnnParamSize(int num_layer, @@ -473,7 +473,10 @@ class RNNOp { if (ctx_.dev_type == kCPU) { this->init_space_ = false; + this->temp_init_space_ = false; this->reserve_cpu_space_size_ = 0; + this->temp_cpu_space_size_ = 0; + if (param_.projection_size.has_value()) { LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; @@ -509,7 +512,7 @@ class RNNOp { CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); } init_cudnn_ = false; - + Storage::Get()->Free(temp_space_); Storage::Get()->Free(reserve_space_); } #if USE_CUDNN_LSTM_PROJ @@ -525,6 +528,10 @@ class RNNOp { Storage::Get()->Free(reserve_cpu_space_); init_space_ = false; } + if (temp_init_space_) { + Storage::Get()->Free(temp_cpu_space_); + temp_init_space_ = false; + } } } @@ -579,11 +586,6 @@ class RNNOp { if (!init_cudnn_) { Init(ctx, s, in_data, out_data); } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -663,7 +665,7 @@ class RNNOp { nullptr, nullptr, nullptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -685,7 +687,7 @@ class RNNOp { hy_ptr, cy_desc_, cy_ptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -716,7 +718,7 @@ class RNNOp { nullptr, nullptr, nullptr, - temp_space.dptr_, + 
temp_space_.dptr, workspace_byte_)); #else CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, @@ -736,7 +738,7 @@ class RNNOp { hy_ptr, cy_desc_, cy_ptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_)); #endif } @@ -747,9 +749,17 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - DType* work_cpu_space = workspace.dptr_; + if (temp_init_space_ && temp_cpu_space_size_ < work_cpu_space_size) { + Storage::Get()->Free(temp_cpu_space_); + temp_init_space_ = false; + } + if (!temp_init_space_) { + temp_cpu_space_ = Storage::Get()->Alloc + (work_cpu_space_size * sizeof(DType), Context::CPU()); + temp_cpu_space_size_ = work_cpu_space_size; + temp_init_space_ = true; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.dptr); if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -888,11 +898,6 @@ class RNNOp { Init(ctx, s, in_data, out_data); } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -920,7 +925,7 @@ class RNNOp { dcx_ptr, nullptr, nullptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -932,7 +937,7 @@ class RNNOp { hx.dptr_, y_data_desc_, y.dptr_, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, dw_desc_, dw.dptr_, @@ -962,7 +967,7 @@ class RNNOp { dhx.dptr_, dcx_desc_, dcx_ptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -975,7 +980,7 @@ class RNNOp { hx.dptr_, y_desc_vec_.data(), y.dptr_, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, dw_desc_, dw.dptr_, @@ -989,10 +994,10 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - DType* work_cpu_space = NULL; - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; + if (!temp_init_space_ || temp_cpu_space_size_ != work_cpu_space_size) { + LOG(FATAL) << "Check temp init error"; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.dptr); size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); @@ -1280,6 +1285,8 @@ class RNNOp { workspace_size_ = workspace_byte_ / sizeof(DType); // Allocate the reserve space reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); + // Allocate the temp space + temp_space_ = Storage::Get()->Alloc(workspace_byte_, Context::GPU(s->dev_id)); // Check that number of params are correct size_t cudnn_param_size; CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, @@ -1348,7 +1355,7 @@ class RNNOp { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle reserve_space_; + Storage::Handle reserve_space_, temp_space_; uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) size_t workspace_byte_, reserve_space_byte_, dropout_byte_; int workspace_size_; @@ -1369,9 +1376,9 @@ class RNNOp { 
cudnnTensorFormat_t format_; #endif #endif - bool init_space_; - size_t reserve_cpu_space_size_; - Storage::Handle reserve_cpu_space_; + bool init_space_, temp_init_space_; + size_t reserve_cpu_space_size_, temp_cpu_space_size_; + Storage::Handle reserve_cpu_space_, temp_cpu_space_; }; // class RNNOp static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 70ee3cd216e0..74c563afceb1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -246,7 +246,6 @@ The definition of GRU here is slightly different from paper but compatible with .set_attr("FResourceRequestEx", [](const NodeAttrs& attrs, const int dev_mask, const DispatchMode dispatch_mode) { std::vector request; - request.emplace_back(ResourceRequest::kTempSpace); const RNNParam& param = nnvm::get(attrs.parsed); if (param.p == 0) return request; if (dev_mask == kGPU) { @@ -275,9 +274,6 @@ NNVM_REGISTER_OP(_backward_RNN) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) .set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet
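
For readers following the series, the end state is a stateful operator: RNN is registered with FCreateOpState plus FStatefulCompute, and the op state object (RNNOp / CuDNNRNNOp) keeps the expensive, reusable pieces — cuDNN descriptors, reserve space, temp space — alive across forward/backward calls instead of rebuilding them per invocation. The standalone C++ sketch below illustrates only that pattern under stated assumptions; it does not include MXNet headers, and every name in it (RNNParamSketch, StatefulRNNSketch, CreateState) is an illustrative stand-in, not a symbol defined by these patches.

// Minimal sketch of the stateful-operator idea used above: state is created
// once, owns the buffers that must survive between forward and backward,
// and is reused by every subsequent compute call.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct RNNParamSketch {
  int mode;            // 0: rnn_relu, 1: rnn_tanh, 2: lstm, 3: gru
  float p;             // dropout probability, expected 0 <= p < 1
  bool state_outputs;  // whether hy (and cy for LSTM) are emitted
};

// Plays the role of RNNOp<DType>: the reserve buffer written by Forward is
// kept in the state and read back by Backward, with no per-call re-init.
class StatefulRNNSketch {
 public:
  explicit StatefulRNNSketch(const RNNParamSketch& param) : param_(param) {}

  void Forward(const std::vector<float>& x, std::vector<float>* y) {
    // Lazily (re)size the reserve space, mirroring the
    // init_space_/reserve_cpu_space_size_ bookkeeping in the patches.
    if (reserve_.size() < x.size()) reserve_.resize(x.size());
    y->assign(x.begin(), x.end());                     // placeholder "compute"
    std::copy(x.begin(), x.end(), reserve_.begin());   // stash for backward
  }

  void Backward(const std::vector<float>& dy, std::vector<float>* dx) {
    // Consumes the reserve space produced by Forward; nothing is reallocated.
    dx->resize(dy.size());
    for (std::size_t i = 0; i < dy.size(); ++i) dx->at(i) = dy[i] + reserve_[i];
  }

 private:
  RNNParamSketch param_;
  std::vector<float> reserve_;  // analogue of reserve_space_ / reserve_cpu_space_
};

// Plays the role of CreateRNNState: one state object per operator instance.
std::shared_ptr<StatefulRNNSketch> CreateState(const RNNParamSketch& param) {
  return std::make_shared<StatefulRNNSketch>(param);
}

int main() {
  RNNParamSketch param{2 /* lstm */, 0.5f, true};
  auto state = CreateState(param);   // analogue of FCreateOpState
  std::vector<float> x{1.f, 2.f, 3.f}, y, dy{0.1f, 0.1f, 0.1f}, dx;
  state->Forward(x, &y);             // analogue of FStatefulCompute (forward)
  state->Backward(dy, &dx);          // analogue of FStatefulCompute (backward)
  std::cout << dx[0] << " " << dx[1] << " " << dx[2] << std::endl;
  return 0;
}

The resource-request change in the last patch follows the same direction: the FResourceRequestEx lambda asks for the cuDNN dropout-descriptor resource only when dropout is enabled (param.p > 0) and the op is dispatched to GPU with cuDNN, and the kTempSpace request is dropped from both RNN and _backward_RNN because workspace and reserve buffers are now allocated once inside the op state (temp_space_/reserve_space_ on GPU, temp_cpu_space_/reserve_cpu_space_ on CPU) and released in the destructor.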