From 8740d0fa046b6626cc7a2010671120b3fd2e59df Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 10:35:44 +0800 Subject: [PATCH 01/25] change RNN OP to stateful --- src/operator/cudnn_rnn-inl.h | 93 ++++++----- src/operator/rnn-inl.h | 264 +++++--------------------------- src/operator/rnn.cc | 288 +++++++++++++++++++++++++++++++++-- src/operator/rnn.cu | 67 ++++++-- 4 files changed, 416 insertions(+), 296 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index cc8e4db404da..425c67864e28 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -27,7 +27,6 @@ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 - #include #include #include @@ -40,8 +39,9 @@ namespace mxnet { namespace op { #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 template -class CuDNNRNNOp : public Operator { +class CuDNNRNNOp { public: + RNNParam param_; explicit CuDNNRNNOp(RNNParam param) { this->param_ = param; init_cudnn_ = false; @@ -99,12 +99,6 @@ class CuDNNRNNOp : public Operator { #endif // RNN Direction direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - // Other - if (param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; - // Create descriptors CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); @@ -166,18 +160,20 @@ class CuDNNRNNOp : public Operator { #endif } - virtual void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 3 : 2; - if (!param_.state_outputs) - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); Stream *s = ctx.get_stream(); // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); @@ -191,10 +187,9 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; - - if (param_.lstm_q_) + if (param_.mode == rnn_enum::kLstm) cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - if (param_.lstm_q_ && param_.state_outputs) + if (param_.mode == rnn_enum::kLstm && param_.state_outputs) cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; CHECK_EQ(x.CheckContiguous(), true); @@ -367,24 +362,26 @@ class CuDNNRNNOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 
3 : 2; - if (!param_.state_outputs) - out_expected = 1; - - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(in_grad.size(), in_expected); - CHECK_EQ(out_grad.size(), out_expected); - CHECK_EQ(req.size(), in_expected); + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; Stream *s = ctx.get_stream(); @@ -534,13 +531,17 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR >= 5 format_ = CUDNN_TENSOR_NCHW; #endif - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 3 : 2; - if (!param_.state_outputs) - out_expected = 1; - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors @@ -854,10 +855,8 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR >= 5 cudnnTensorFormat_t format_; #endif - RNNParam param_; }; -#endif // __CUDACC__ && CUDNN +#endif // CUDNN } // namespace op } // namespace mxnet - #endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 71ad331786ae..0460c193de5b 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -160,9 +160,8 @@ struct RNNParam : public dmlc::Parameter { uint32_t num_layers; bool bidirectional, state_outputs; int mode; - float p, pkeep_; + float p; int seq_length_, batch_size_, input_size_; - bool lstm_q_; // whether type is lstm dmlc::optional projection_size; dmlc::optional lstm_state_clip_min, lstm_state_clip_max; bool lstm_state_clip_nan; @@ -212,7 +211,6 @@ struct RNNParam : public dmlc::Parameter { } }; - /** * @params: ws: Temp workspace for gemm's output storage. * rs: Reserve space of forward intermediate data used for training. @@ -236,6 +234,7 @@ struct RNNParam : public dmlc::Parameter { * hy's shape is [num_layers, batch_size, state_size] * cy_ptr: Only used in lstm mode. pointer of tensor cy containing the cell state * for t=seq_length. cy' shape is [num_layers, batch_size, state_size] + * dropout: should be 0 <= dropout < 1 * mode: Specifies the type of RNN to compute. 
*/ template @@ -377,7 +376,7 @@ void RNNBackward(DType* ws, } template -class RNNOp : public Operator{ +class RNNOp { public: explicit RNNOp(RNNParam p) :param_(p), init_space_(false), reserve_space_size_(0) { @@ -397,23 +396,24 @@ class RNNOp : public Operator{ } } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - if (!param_.state_outputs) { - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); Stream *s = ctx.get_stream(); // get input + output tensor Tensor x = in_data[rnn_enum::kData].get(s); @@ -427,7 +427,6 @@ class RNNOp : public Operator{ param_.seq_length_ = x.shape_[0]; param_.batch_size_ = x.shape_[1]; param_.input_size_ = x.shape_[2]; - const int direction = param_.bidirectional ? 2 : 1; const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; @@ -451,7 +450,6 @@ class RNNOp : public Operator{ param_.state_size, direction, param_.mode); Tensor workspace = ctx.requested[rnn_enum::kTempSpace] .get_space_typed(Shape1(workspace_size), s); - if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -460,7 +458,6 @@ class RNNOp : public Operator{ Storage::Get()->Free(reserve_space_); init_space_ = false; } - if (!init_space_) { reserve_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); reserve_space_size_ = r_size; @@ -468,6 +465,7 @@ class RNNOp : public Operator{ } DType* reserve_space_ptr = static_cast(reserve_space_.dptr); + RNNForwardTraining(workspace.dptr_, reserve_space_ptr, param_.state_outputs, @@ -508,28 +506,30 @@ class RNNOp : public Operator{ } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - if (!param_.state_outputs) { - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(in_grad.size(), in_expected); - CHECK_EQ(out_grad.size(), out_expected); - CHECK_EQ(req.size(), in_expected); + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; mshadow::Stream *s = ctx.get_stream(); @@ -556,6 +556,7 @@ class RNNOp : public Operator{ const int direction = param_.bidirectional ? 2 : 1; const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; DType * dhy_ptr = NULL; @@ -585,6 +586,7 @@ class RNNOp : public Operator{ size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); + if (!init_space_ || reserve_space_size_ != r_size) { LOG(FATAL) << "Check forward init error"; } @@ -620,203 +622,15 @@ class RNNOp : public Operator{ param_.mode); } - private: RNNParam param_; + + private: bool init_space_; size_t reserve_space_size_; Storage::Handle reserve_space_; }; // class RNNOp -template -Operator* CreateOp(RNNParam param, int dtype); - -#if DMLC_USE_CXX11 -class RNNProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "state_cell"}; - } else { - return {"data", "parameters", "state"}; - } - } - - std::vector ListOutputs() const override { - std::vector outputs = {"output"}; - if (!param_.state_outputs) - return outputs; - else - outputs.emplace_back("state"); - if (param_.mode == rnn_enum::kLstm) - outputs.emplace_back("state_cell"); - return outputs; - } - - int NumOutputs() const override { - int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; - int num_outputs = param_.state_outputs ? (mode_num + 1) : 1; - return num_outputs; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { - using namespace mshadow; - if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4U) << "Input:[data, parameters, state, cell_state]"; - } else { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, parameters, state]"; - } - const mxnet::TShape &dshape = (*in_shape)[rnn_enum::kData]; - if (dshape.ndim() == 0) return false; - CHECK_EQ(dshape.ndim(), 3U) \ - << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; - // data: [sequence len, batch, input dimension] - int batch_size = dshape[1]; - int input_size = dshape[2]; - int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional - int layer_size = (param_.projection_size.has_value()) ? 
- param_.projection_size.value() : param_.state_size; - SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kState, - Shape3(total_layers, batch_size, layer_size)); - if (param_.mode == rnn_enum::kLstm) - SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size)); - - // calculate parameter vector length - int param_size = GetRnnParamSize(param_.num_layers, - input_size, - param_.state_size, - numDirections, - param_.mode, - param_.projection_size); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); - - out_shape->clear(); - // output: [sequence len, batch, output size] - mxnet::TShape oshape = dshape; - if (param_.projection_size.has_value()) { - oshape[2] = numDirections * param_.projection_size.value(); - } else { - oshape[2] = numDirections * param_.state_size; - } - out_shape->push_back(oshape); - if (!param_.state_outputs) { - return true; - } else { - // outStateShape: [layer_num, batch, state size] - mxnet::TShape outStateShape = dshape; - outStateShape[0] = total_layers; - outStateShape[1] = batch_size; - if (param_.projection_size.has_value()) { - outStateShape[2] = param_.projection_size.value(); - } else { - outStateShape[2] = param_.state_size; - } - out_shape->push_back(outStateShape); - // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) { - mxnet::TShape cellStateShape = dshape; - cellStateShape[0] = total_layers; - cellStateShape[1] = batch_size; - cellStateShape[2] = param_.state_size; - out_shape->push_back(cellStateShape); - } - return true; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (size_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - if (!param_.state_outputs) { - return true; - } else { - out_type->push_back(dtype); - // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) - out_type->push_back(dtype); - return true; - } - } - - OperatorProperty* Copy() const override { - auto ptr = new RNNProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "RNN"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - std::vector dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams], - in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]}; - - if (param_.state_outputs) { - dep.push_back(out_data[rnn_enum::kStateOut]); - dep.push_back(out_grad[rnn_enum::kStateOut]); - } - - if (param_.mode == rnn_enum::kLstm) { - dep.push_back(in_data[rnn_enum::kStateCell]); - if (param_.state_outputs) { - dep.push_back(out_data[rnn_enum::kStateCellOut]); - dep.push_back(out_grad[rnn_enum::kStateCellOut]); - } - } - return dep; - } - - std::vector ForwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, 
mxnet::ShapeVector *in_shape, - std::vector *in_type) const override; - - private: - RNNParam param_; -}; // class RNNProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet + #endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 621b9eb110e7..6a9b7c001477 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -24,27 +24,244 @@ * \author Sebastian Bodenstein */ #include "./rnn-inl.h" +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 +#include "./cudnn_rnn-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { -template<> -Operator *CreateOp(RNNParam param, int dtype) { - Operator *op = nullptr; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new RNNOp(param); + +DMLC_REGISTER_PARAMETER(RNNParam); +static inline std::vector ListArguments(const RNNParam& param_) { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "parameters", "state", "state_cell"}; + } else { + return {"data", "parameters", "state"}; + } +} + +static bool RNNShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const RNNParam& param_ = nnvm::get(attrs.parsed); + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4U) << "Needed input:[data, parameters, state, cell_state]," + << " got in_shape->size(): " << in_shape->size(); + } else { + CHECK_EQ(in_shape->size(), 3U) << + "Needed input:[data, parameters, state], got in_shape->size(): " << in_shape->size(); + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3U) \ + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // data: [sequence len, batch, input dimension] + int batch_size = dshape[1]; + int input_size = dshape[2]; + int numDirections = param_.bidirectional ? 2 : 1; + int total_layers = numDirections * param_.num_layers; // double for bidirectional + int layer_size = (param_.projection_size.has_value()) ? 
+ param_.projection_size.value() : param_.state_size; + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kState, + Shape3(total_layers, batch_size, layer_size)); + if (param_.mode == rnn_enum::kLstm) { + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateCell, + Shape3(total_layers, batch_size, param_.state_size)); + } + + // calculate parameter vector length + int param_size = GetRnnParamSize(param_.num_layers, + input_size, + param_.state_size, + numDirections, + param_.mode, + param_.projection_size); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + out_shape->clear(); + // output: [sequence len, batch, output size] + TShape oshape = dshape; + if (param_.projection_size.has_value()) { + oshape[2] = numDirections * param_.projection_size.value(); + } else { + oshape[2] = numDirections * param_.state_size; + } + out_shape->push_back(oshape); + if (param_.state_outputs) { + // outStateShape: [layer_num, batch, state size] + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batch_size; + if (param_.projection_size.has_value()) { + outStateShape[2] = param_.projection_size.value(); + } else { + outStateShape[2] = param_.state_size; + } + out_shape->push_back(outStateShape); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) { + TShape cellStateShape = dshape; + cellStateShape[0] = total_layers; + cellStateShape[1] = batch_size; + cellStateShape[2] = param_.state_size; + out_shape->push_back(cellStateShape); + } + } + return true; +} + +static bool RNNType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const RNNParam& param_ = nnvm::get(attrs.parsed); + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_type->size(), 4U); + } else { + CHECK_EQ(in_type->size(), 3U); + } + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (size_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + TYPE_ASSIGN_CHECK(*in_type, i, dtype); + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + if (param_.state_outputs) { + out_type->push_back(dtype); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + } + return true; +} + +inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + DispatchMode wanted_mode = DispatchMode::kFCompute; + + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +inline static bool BackwardRNNStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + DispatchMode wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +struct RNNGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr &n, + const std::vector &ograd) const { + const RNNParam& params = nnvm::get(n->attrs.parsed); + std::vector heads{ n->inputs[rnn_enum::kData], + n->inputs[rnn_enum::kParams], n->inputs[rnn_enum::kState] }; + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kOut, 0}); + heads.push_back(ograd[rnn_enum::kOut]); + if (params.state_outputs) { + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kStateOut, 0}); + heads.push_back(ograd[rnn_enum::kStateOut]); + } + if 
(params.mode == rnn_enum::kLstm) { + heads.push_back(n->inputs[rnn_enum::kStateCell]); + if (params.state_outputs) { + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kStateCellOut, 0}); + heads.push_back(ograd[rnn_enum::kStateCellOut]); + } + } + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, + const Context ctx, + const mxnet::ShapeVector &in_shapes, + const std::vector &in_types) { + const RNNParam& param = nnvm::get(attrs.parsed); + OpStatePtr state = OpStatePtr(); + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { + #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + state = OpStatePtr::Create>(param); + #else + state = OpStatePtr::Create>(param); + #endif + return state; }); - return op; + return OpStatePtr(); // should never reach here } -Operator *RNNProp::CreateOperatorEx(Context ctx, - mxnet::ShapeVector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); } +/* +index description +0: x +1: w +2: hx +3: y +4: dy +5: hy +6: dhy +7: cx +8: cy +9: dcy +*/ +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; -DMLC_REGISTER_PARAMETER(RNNParam); + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); + } -MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); + } + } + + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); +} + +NNVM_REGISTER_OP(RNN) .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. @@ -97,7 +314,38 @@ The definition of GRU here is slightly different from paper but compatible with z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ - \end{array})code") + \end{array} +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return params.mode == rnn_enum::kLstm ? 4 : 3; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + // kOut + int num_outputs = 1; + if (params.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (params.mode == rnn_enum::kLstm) ? 
3 : 2; + } + + return num_outputs; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return ListArguments(params); +}) +.set_attr("FInferShape", RNNShape) +.set_attr("FInferType", RNNType) +.set_attr("FInferStorageType", RNNStorageType) +.set_attr("FCreateOpState", CreateRNNState) +.set_attr("FStatefulCompute", RNNStatefulCompute) +.set_attr("FGradient", RNNGrad{"_backward_RNN"}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) .add_argument("data", "NDArray-or-Symbol", "Input data to RNN") .add_argument("parameters", "NDArray-or-Symbol", "Vector of all RNN trainable parameters concatenated") @@ -105,5 +353,19 @@ The definition of GRU here is slightly different from paper but compatible with .add_argument("state_cell", "NDArray-or-Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_RNN) +.set_num_outputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return params.mode == rnn_enum::kLstm ? 4 : 3; +}) +.set_attr_parser(ParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", BackwardRNNStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 402a8cf5f503..91f38f91f67c 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -32,18 +32,63 @@ namespace mxnet { namespace op { -template<> -Operator* CreateOp(RNNParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNRNNOp(param); - }) -#else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR - return op; + +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + CuDNNRNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); + #else + LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + #endif // MXNET_USE_CUDNN && CUDNN_MAJOR } +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; + int dtype = inputs[rnn_enum::kData].type_flag_; + #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + CuDNNRNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); + } + + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); + } + } + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); + #else + 
LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + #endif // MXNET_USE_CUDNN && CUDNN_MAJOR +} + +NNVM_REGISTER_OP(RNN) +.set_attr("FStatefulCompute", RNNStatefulCompute); + +NNVM_REGISTER_OP(_backward_RNN) +.set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet From 9c38854721ab0e86799d300445ebff7e5af6dff3 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 12:26:03 +0800 Subject: [PATCH 02/25] retrigger the ci From 50440e9a148b698fa67ae842c5f694e9208cf2be Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 13:22:05 +0800 Subject: [PATCH 03/25] fix windows compile issue --- src/operator/rnn.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a9b7c001477..5da870643cd0 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -192,14 +192,17 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, const std::vector &in_types) { const RNNParam& param = nnvm::get(attrs.parsed); OpStatePtr state = OpStatePtr(); - MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { state = OpStatePtr::Create>(param); - #else + return state; + }); + #else + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { state = OpStatePtr::Create>(param); - #endif return state; - }); + }); + #endif return OpStatePtr(); // should never reach here } From 5324d283db2b7f01ab97e83a68979ce31988a618 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 22:07:00 +0800 Subject: [PATCH 04/25] move cudnnrnn class into rnn-inl.h --- src/operator/cudnn_rnn-inl.h | 862 --------------------------------- src/operator/rnn-inl.h | 898 +++++++++++++++++++++++++++++++++++ src/operator/rnn.cc | 104 ---- src/operator/rnn.cu | 55 --- 4 files changed, 898 insertions(+), 1021 deletions(-) delete mode 100644 src/operator/cudnn_rnn-inl.h diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h deleted file mode 100644 index 425c67864e28..000000000000 --- a/src/operator/cudnn_rnn-inl.h +++ /dev/null @@ -1,862 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2016 by Contributors - * \file cudnn_rnn-inl.h - * \brief - * \author Sebastian Bodenstein -*/ -#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ - -#define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 -#include -#include -#include -#include -#include -#include -#include "./rnn-inl.h" - -namespace mxnet { -namespace op { -#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -template -class CuDNNRNNOp { - public: - RNNParam param_; - explicit CuDNNRNNOp(RNNParam param) { - this->param_ = param; - init_cudnn_ = false; - dtype_ = mshadow::DataType::kCudnnFlag; - // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. - // No tests in place for fp16 RNNs, so leave TensorCore disabled for now. - cudnn_tensor_core_ = false; - // When fp16 RNN tests are introduced, we can enable TensorCore as follows: -// cudnn_tensor_core = -// mshadow::DataType::kFlag == mshadow::kFloat16 && GetEnvAllowTensorCore(); - // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet - // RNN Mode - switch (param_.mode) { - case rnn_enum::kRnnRelu: - mode_ = CUDNN_RNN_RELU; - break; - case rnn_enum::kRnnTanh: - mode_ = CUDNN_RNN_TANH; - break; - case rnn_enum::kLstm: - mode_ = CUDNN_LSTM; - break; - case rnn_enum::kGru: - mode_ = CUDNN_GRU; - break; - default: - LOG(FATAL) << "Not implmented"; - } -#if USE_CUDNN_LSTM_PROJ - if (param_.projection_size.has_value()) { - CHECK_EQ(param_.mode, rnn_enum::kLstm) - << "Projection is only supported for LSTM."; - CHECK_GE(param_.state_size, param_.projection_size.value()) - << "State size must be larger than projection size."; - } -#else - CHECK(!param_.projection_size.has_value()) - << "Projection is only supported for LSTM with CuDNN version later than 7.1.1."; -#endif -#if USE_CUDNN_LSTM_PROJ - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - CHECK_EQ(param_.mode, rnn_enum::kLstm) - << "State clipping is only supported for LSTM."; - CHECK(param_.lstm_state_clip_min.has_value() && param_.lstm_state_clip_max.has_value()) - << "lstm_state_clip_min and lstm_state_clip_max must be specified together."; - CHECK_GE(param_.lstm_state_clip_max.value(), param_.lstm_state_clip_min.value()) - << "lstm_state_clip_max must be greater or equal to lstm_state_clip_min"; - } -#else - CHECK(!param_.lstm_state_clip_min.has_value() - && !param_.lstm_state_clip_max.has_value()) - << "State clipping is only supported for LSTM with CuDNN version later than 7.2.1."; -#endif - // RNN Direction - direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - // Create descriptors - CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); - CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); - - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&y_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); - #endif - } - - ~CuDNNRNNOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); - CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); - CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); - - if (init_cudnn_) { - for (size_t i = 0; i < x_desc_vec_.size(); ++i) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); - } - init_cudnn_ = false; - - Storage::Get()->Free(reserve_space_); - if (param_.p > 0) { - Storage::Get()->Free(dropout_states_); - } - } - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(y_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); - #endif - } - - void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - - void * hy_ptr = NULL; - if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; - - DType * cx_ptr = NULL; - DType * cy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm) - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - if (param_.mode == rnn_enum::kLstm && param_.state_outputs) - cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; - - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - - if (!init_cudnn_) { - Init(s, in_data, out_data); - } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); - #if USE_CUDNN_LSTM_PROJ - std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); - CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - seqLengthArray.data(), - nullptr)); - int out_size = - (param_.projection_size.has_value()) ? param_.projection_size.value() : param_.state_size; - out_size = (param_.bidirectional) ? (out_size * 2) : out_size; - CUDNN_CALL(cudnnSetRNNDataDescriptor(y_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - out_size, - seqLengthArray.data(), - nullptr)); - if (ctx.is_train) { - CUDNN_CALL(cudnnSetRNNDataDescriptor(dx_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - seqLengthArray.data(), - nullptr)); - CUDNN_CALL(cudnnSetRNNDataDescriptor(dy_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - out_size, - seqLengthArray.data(), - nullptr)); - } - #endif - - #if USE_CUDNN_LSTM_PROJ - bool clip_state = param_.lstm_state_clip_min.has_value(); - bool clip_nan = param_.lstm_state_clip_nan; - CUDNN_CALL(cudnnRNNSetClip(s->dnn_handle_, - rnn_desc_, - clip_state ? CUDNN_RNN_CLIP_MINMAX : CUDNN_RNN_CLIP_NONE, - clip_nan ? CUDNN_NOT_PROPAGATE_NAN : CUDNN_PROPAGATE_NAN, - clip_state ? param_.lstm_state_clip_min.value() : 0.0, - clip_state ? 
param_.lstm_state_clip_max.value() : 0.0)); - #endif - - if (ctx.is_train) { - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNForwardTrainingEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_data_desc_, - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - #else - CUDNN_CALL(cudnnRNNForwardTraining(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_desc_vec_.data(), - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - #endif - } else { - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNForwardInferenceEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_data_desc_, - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_)); - #else - CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_desc_vec_.data(), - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - temp_space.dptr_, - workspace_byte_)); - #endif - } - } - - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - CHECK_EQ(in_grad.size(), num_inputs); - CHECK_EQ(out_grad.size(), num_outputs); - CHECK_EQ(req.size(), num_inputs); - CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; - CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); - if (req[rnn_enum::kParams] != kAddTo) { - dw = mshadow::expr::ScalarExp(0.0f); - } - // only need kStateOut grad output_states is true - void * dhy_ptr = NULL; - if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; - - // Deal with lstm - void * dcx_ptr = NULL; - void * dcy_ptr = NULL; - void * cx_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; - } - if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(dw.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(dhx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(dy.CheckContiguous(), true); - - if (!init_cudnn_) { - Init(s, in_data, out_data); - } - - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, - rnn_desc_, - y_data_desc_, - y.dptr_, - dy_data_desc_, - dy.dptr_, - nullptr, - nullptr, - dhy_desc_, - dhy_ptr, - dcy_desc_, - dcy_ptr, - w_desc_, - w.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - dx_data_desc_, - dx.dptr_, - dhx_desc_, - dhx.dptr_, - dcx_desc_, - dcx_ptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - y_data_desc_, - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); - #else - CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - y_desc_vec_.data(), - y.dptr_, - dy_desc_vec_.data(), - dy.dptr_, - dhy_desc_, - dhy_ptr, - dcy_desc_, - dcy_ptr, - w_desc_, - w.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - dx_desc_vec_.data(), - dx.dptr_, - dhx_desc_, - dhx.dptr_, - dcx_desc_, - dcx_ptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - y_desc_vec_.data(), - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); - #endif - } - - private: - inline void 
Init(mshadow::Stream *s, - const std::vector &in_data, - const std::vector &out_data) { - using namespace mshadow; - #if CUDNN_MAJOR >= 5 - format_ = CUDNN_TENSOR_NCHW; - #endif - - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - if (!init_cudnn_) { - init_cudnn_ = true; - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - - // Tensor Descriptors - std::vector x_vec(param_.seq_length_); - std::vector y_vec(param_.seq_length_); - std::vector dx_vec(param_.seq_length_); - std::vector dy_vec(param_.seq_length_); - int dimA[3]; - int strideA[3]; - for (int i = 0; i < param_.seq_length_; i++) { - CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i])); - - dimA[0] = param_.batch_size_; - dimA[1] = param_.input_size_; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA)); - CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA)); - dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA)); - CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA)); - } - x_desc_vec_ = x_vec; - y_desc_vec_ = y_vec; - dx_desc_vec_ = dx_vec; - dy_desc_vec_ = dy_vec; - - // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - #if USE_CUDNN_LSTM_PROJ - int dimB[3]; - int strideB[3]; - dimB[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimB[1] = param_.batch_size_; - dimB[2] = param_.projection_size.has_value() ? 
- param_.projection_size.value() : param_.state_size; - strideB[0] = dimB[2] * dimB[1]; - strideB[1] = dimB[2]; - strideB[2] = 1; - #endif - - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_, - dtype_, - 3, - dimA, - strideA)); - - // Create Dropout descriptors - if (param_.p > 0) { - CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); - dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); - } else { - dropout_states_ = {}; - dropout_byte_ = 0; - } - CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, - param_.p, // discard probability - dropout_states_.dptr, dropout_byte_, - seed_)); - // RNN descriptors - #if CUDNN_MAJOR >= 6 - cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; - CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_, - rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - rnn_algo, - dtype_)); - #else - CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_)); - #endif - #if CUDNN_MAJOR >= 7 - cudnnMathType_t math_type = CUDNN_DEFAULT_MATH; - if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) { - math_type = CUDNN_TENSOR_OP_MATH; - } - #if CUDNN_VERSION >= 7200 - if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && - (DataType::kFlag != kFloat16)) - math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; - #endif - CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type)); - #endif - #if USE_CUDNN_LSTM_PROJ - if (param_.projection_size.has_value()) { - CUDNN_CALL(cudnnSetRNNProjectionLayers(s->dnn_handle_, - rnn_desc_, - param_.projection_size.value(), - 0)); - } - #endif - // Get temp space sizes - CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - &workspace_byte_)); - CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - &reserve_space_byte_)); - workspace_size_ = workspace_byte_ / sizeof(DType); - // Allocate the reserve space - reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); - - // Check that number of params are correct - size_t cudnn_param_size; - CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, - rnn_desc_, - x_desc_vec_[0], 
- &cudnn_param_size, - dtype_)); - CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); - - // Set param descriptors - int dim_w[3] = {1, 1, 1}; - dim_w[0] = w.shape_[0]; - CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_, - dtype_, - format_, - 3, - dim_w)); - CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_, - dtype_, - format_, - 3, - dim_w)); - - // Query weight layout - // cudnnFilterDescriptor_t m_desc; - // CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS); - // DType *p; - // int n = 2; - // int64_t last = 0; - // if (param_.mode == rnn_enum::kLstm) n = 8; - // else if (param_.mode == rnn_enum::kGru) n = 6; - - // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { - // for (int j = 0; j < n; ++j) { - // CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_, - // i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); - // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; - // last = ((int64_t)(p - NULL))/sizeof(DType); - // cudnnDataType_t t; - // cudnnTensorFormat_t f; - // int ndim = 5; - // int dims[5] = {0, 0, 0, 0, 0}; - // CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]), - // CUDNN_STATUS_SUCCESS); - // LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); - // for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i]; - // } - // } - - // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { - // for (int j = 0; j < n; ++j) { - // CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0], - // w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); - // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; - // last = ((int64_t)(p - NULL))/sizeof(DType); - // LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); - // } - // } - } - } - - cudnnDataType_t dtype_; - bool init_cudnn_; - cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t mode_; - cudnnDirectionMode_t direction_; - cudnnRNNInputMode_t input_mode_; - cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle dropout_states_, reserve_space_; - uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) - size_t workspace_byte_, reserve_space_byte_, dropout_byte_; - int workspace_size_, dropout_size_; - std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; - #if USE_CUDNN_LSTM_PROJ - cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; - #endif - cudnnTensorDescriptor_t hx_desc_, cx_desc_; - cudnnTensorDescriptor_t hy_desc_, cy_desc_; - cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - - cudnnFilterDescriptor_t w_desc_, dw_desc_; - // Allow TensorCore algo policy - bool cudnn_tensor_core_; - - #if CUDNN_MAJOR >= 5 - cudnnTensorFormat_t format_; - #endif -}; -#endif // CUDNN -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 0460c193de5b..e0b5de7b18f0 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -26,6 +26,9 @@ #ifndef MXNET_OPERATOR_RNN_INL_H_ #define MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_USE_CUDNN_RNN MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 +#define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 + #include #include #include @@ -35,6 +38,7 @@ #include #include #include +#include #include "./math.h" #include "./math_functions-inl.h" #include "./operator_common.h" @@ -375,6 
+379,826 @@ void RNNBackward(DType* ws, } } +#if MXNET_USE_CUDNN_RNN +template +class RNNOp { + public: + RNNParam param_; + explicit RNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. + // No tests in place for fp16 RNNs, so leave TensorCore disabled for now. + cudnn_tensor_core_ = false; + // When fp16 RNN tests are introduced, we can enable TensorCore as follows: +// cudnn_tensor_core = +// mshadow::DataType::kFlag == mshadow::kFloat16 && GetEnvAllowTensorCore(); + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implmented"; + } +#if USE_CUDNN_LSTM_PROJ + if (param_.projection_size.has_value()) { + CHECK_EQ(param_.mode, rnn_enum::kLstm) + << "Projection is only supported for LSTM."; + CHECK_GE(param_.state_size, param_.projection_size.value()) + << "State size must be larger than projection size."; + } +#else + CHECK(!param_.projection_size.has_value()) + << "Projection is only supported for LSTM with CuDNN version later than 7.1.1."; +#endif +#if USE_CUDNN_LSTM_PROJ + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + CHECK_EQ(param_.mode, rnn_enum::kLstm) + << "State clipping is only supported for LSTM."; + CHECK(param_.lstm_state_clip_min.has_value() && param_.lstm_state_clip_max.has_value()) + << "lstm_state_clip_min and lstm_state_clip_max must be specified together."; + CHECK_GE(param_.lstm_state_clip_max.value(), param_.lstm_state_clip_min.value()) + << "lstm_state_clip_max must be greater or equal to lstm_state_clip_min"; + } +#else + CHECK(!param_.lstm_state_clip_min.has_value() + && !param_.lstm_state_clip_max.has_value()) + << "State clipping is only supported for LSTM with CuDNN version later than 7.2.1."; +#endif + // RNN Direction + direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Create descriptors + CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); + CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); + + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&y_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); + #endif + } + + ~RNNOp() { + CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); + CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); + CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); + + if (init_cudnn_) { + for (size_t i = 0; i < x_desc_vec_.size(); ++i) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); + } + init_cudnn_ = false; + + Storage::Get()->Free(reserve_space_); + if (param_.p > 0) { + Storage::Get()->Free(dropout_states_); + } + } + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(y_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); + #endif + } + + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + + void * hy_ptr = NULL; + if (param_.state_outputs) + hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm) + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + if (param_.mode == rnn_enum::kLstm && param_.state_outputs) + cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + // Get temp space + int temp_size = workspace_size_; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + #if USE_CUDNN_LSTM_PROJ + std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); + CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + seqLengthArray.data(), + nullptr)); + int out_size = + (param_.projection_size.has_value()) ? param_.projection_size.value() : param_.state_size; + out_size = (param_.bidirectional) ? (out_size * 2) : out_size; + CUDNN_CALL(cudnnSetRNNDataDescriptor(y_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + out_size, + seqLengthArray.data(), + nullptr)); + if (ctx.is_train) { + CUDNN_CALL(cudnnSetRNNDataDescriptor(dx_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + seqLengthArray.data(), + nullptr)); + CUDNN_CALL(cudnnSetRNNDataDescriptor(dy_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + out_size, + seqLengthArray.data(), + nullptr)); + } + #endif + + #if USE_CUDNN_LSTM_PROJ + bool clip_state = param_.lstm_state_clip_min.has_value(); + bool clip_nan = param_.lstm_state_clip_nan; + CUDNN_CALL(cudnnRNNSetClip(s->dnn_handle_, + rnn_desc_, + clip_state ? CUDNN_RNN_CLIP_MINMAX : CUDNN_RNN_CLIP_NONE, + clip_nan ? CUDNN_NOT_PROPAGATE_NAN : CUDNN_PROPAGATE_NAN, + clip_state ? param_.lstm_state_clip_min.value() : 0.0, + clip_state ? 
param_.lstm_state_clip_max.value() : 0.0)); + #endif + + if (ctx.is_train) { + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNForwardTrainingEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_data_desc_, + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + #else + CUDNN_CALL(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + #endif + } else { + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNForwardInferenceEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_data_desc_, + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + temp_space.dptr_, + workspace_byte_)); + #else + CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_)); + #endif + } + } + + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); + CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; + CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + if (req[rnn_enum::kParams] != kAddTo) { + dw = mshadow::expr::ScalarExp(0.0f); + } + // only need kStateOut grad output_states is true + void * dhy_ptr = NULL; + if (param_.state_outputs) + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + + // Deal with lstm + void * dcx_ptr = NULL; + void * dcy_ptr = NULL; + void * cx_ptr = NULL; + + if (param_.mode == rnn_enum::kLstm) { + CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + } + if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(dw.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(dhx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(dy.CheckContiguous(), true); + + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, + rnn_desc_, + y_data_desc_, + y.dptr_, + dy_data_desc_, + dy.dptr_, + nullptr, + nullptr, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_data_desc_, + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + nullptr, + nullptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + y_data_desc_, + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + #else + CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + #endif + } + + private: + inline void 
Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + #if CUDNN_MAJOR >= 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; + } + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + if (!init_cudnn_) { + init_cudnn_ = true; + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i])); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA)); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA)); + } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; + + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + #if USE_CUDNN_LSTM_PROJ + int dimB[3]; + int strideB[3]; + dimB[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); + dimB[1] = param_.batch_size_; + dimB[2] = param_.projection_size.has_value() ? 
+ param_.projection_size.value() : param_.state_size; + strideB[0] = dimB[2] * dimB[1]; + strideB[1] = dimB[2]; + strideB[2] = 1; + #endif + + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA)); + + // Create Dropout descriptors + if (param_.p > 0) { + CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); + dropout_size_ = dropout_byte_ / sizeof(DType); + dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); + } else { + dropout_states_ = {}; + dropout_byte_ = 0; + } + CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, + param_.p, // discard probability + dropout_states_.dptr, dropout_byte_, + seed_)); + // RNN descriptors + #if CUDNN_MAJOR >= 6 + cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; + CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_, + rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + rnn_algo, + dtype_)); + #else + CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_)); + #endif + #if CUDNN_MAJOR >= 7 + cudnnMathType_t math_type = CUDNN_DEFAULT_MATH; + if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) { + math_type = CUDNN_TENSOR_OP_MATH; + } + #if CUDNN_VERSION >= 7200 + if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && + (DataType::kFlag != kFloat16)) + math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + #endif + CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type)); + #endif + #if USE_CUDNN_LSTM_PROJ + if (param_.projection_size.has_value()) { + CUDNN_CALL(cudnnSetRNNProjectionLayers(s->dnn_handle_, + rnn_desc_, + param_.projection_size.value(), + 0)); + } + #endif + // Get temp space sizes + CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_)); + CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_)); + workspace_size_ = workspace_byte_ / sizeof(DType); + // Allocate the reserve space + reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); + + // Check that number of params are correct + size_t cudnn_param_size; + CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, + rnn_desc_, + x_desc_vec_[0], 
+ &cudnn_param_size, + dtype_)); + CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); + + // Set param descriptors + int dim_w[3] = {1, 1, 1}; + dim_w[0] = w.shape_[0]; + CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w)); + CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w)); + + // Query weight layout + // cudnnFilterDescriptor_t m_desc; + // CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS); + // DType *p; + // int n = 2; + // int64_t last = 0; + // if (param_.mode == rnn_enum::kLstm) n = 8; + // else if (param_.mode == rnn_enum::kGru) n = 6; + + // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { + // for (int j = 0; j < n; ++j) { + // CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_, + // i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); + // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; + // last = ((int64_t)(p - NULL))/sizeof(DType); + // cudnnDataType_t t; + // cudnnTensorFormat_t f; + // int ndim = 5; + // int dims[5] = {0, 0, 0, 0, 0}; + // CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]), + // CUDNN_STATUS_SUCCESS); + // LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); + // for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i]; + // } + // } + + // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { + // for (int j = 0; j < n; ++j) { + // CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0], + // w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); + // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; + // last = ((int64_t)(p - NULL))/sizeof(DType); + // LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); + // } + // } + } + } + + cudnnDataType_t dtype_; + bool init_cudnn_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + Storage::Handle dropout_states_, reserve_space_; + uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, dropout_size_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + #if USE_CUDNN_LSTM_PROJ + cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; + #endif + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; + + cudnnFilterDescriptor_t w_desc_, dw_desc_; + // Allow TensorCore algo policy + bool cudnn_tensor_core_; + + #if CUDNN_MAJOR >= 5 + cudnnTensorFormat_t format_; + #endif +}; +#else template class RNNOp { public: @@ -629,6 +1453,80 @@ class RNNOp { size_t reserve_space_size_; Storage::Handle reserve_space_; }; // class RNNOp +#endif + +static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, + const Context ctx, + const mxnet::ShapeVector &in_shapes, + const std::vector &in_types) { + const RNNParam& param = nnvm::get(attrs.parsed); + OpStatePtr state = OpStatePtr(); + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { + state = OpStatePtr::Create>(param); + return state; + }); + + return OpStatePtr(); +} + +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, 
+ const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); +} + +/* +index description +0: x +1: w +2: hx +3: y +4: dy +5: hy +6: dhy +7: cx +8: cy +9: dcy +*/ +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; + + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); + } + + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); + } + } + + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); +} } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 5da870643cd0..aa5c3f305975 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -24,9 +24,6 @@ * \author Sebastian Bodenstein */ #include "./rnn-inl.h" -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#include "./cudnn_rnn-inl.h" -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { @@ -141,27 +138,6 @@ static bool RNNType(const nnvm::NodeAttrs& attrs, return true; } -inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - DispatchMode wanted_mode = DispatchMode::kFCompute; - - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -inline static bool BackwardRNNStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - DispatchMode wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - struct RNNGrad { const char *op_name; std::vector operator()(const nnvm::NodePtr &n, @@ -186,84 +162,6 @@ struct RNNGrad { } }; -static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, - const Context ctx, - const mxnet::ShapeVector &in_shapes, - const std::vector &in_types) { - const RNNParam& param = nnvm::get(attrs.parsed); - OpStatePtr state = OpStatePtr(); - #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - state = OpStatePtr::Create>(param); - return state; - }); - #else - MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - state = OpStatePtr::Create>(param); - return state; - }); - #endif - return OpStatePtr(); // should never reach here -} - -template -void RNNStatefulCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - int dtype = inputs[rnn_enum::kData].type_flag_; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); - op.Forward(ctx, inputs, req, outputs); - }); -} -/* -index description -0: x -1: w -2: hx -3: y -4: 
dy -5: hy -6: dhy -7: cx -8: cy -9: dcy -*/ -template -void RNNStatefulGradCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector out_data{inputs[3]}; - std::vector out_grad{inputs[4]}; - const std::vector &in_grad = outputs; - - int dtype = inputs[rnn_enum::kData].type_flag_; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); - const RNNParam& param = op.param_; - int index = 5; - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - out_grad.push_back(inputs[index++]); - } - - if (param.mode == rnn_enum::kLstm) { - in_data.push_back(inputs[index++]); - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - out_grad.push_back(inputs[index]); - } - } - - op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); - }); -} - NNVM_REGISTER_OP(RNN) .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. @@ -342,7 +240,6 @@ The definition of GRU here is slightly different from paper but compatible with }) .set_attr("FInferShape", RNNShape) .set_attr("FInferType", RNNType) -.set_attr("FInferStorageType", RNNStorageType) .set_attr("FCreateOpState", CreateRNNState) .set_attr("FStatefulCompute", RNNStatefulCompute) .set_attr("FGradient", RNNGrad{"_backward_RNN"}) @@ -365,7 +262,6 @@ NNVM_REGISTER_OP(_backward_RNN) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BackwardRNNStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 91f38f91f67c..77bb95522711 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -26,65 +26,10 @@ #include "./rnn-inl.h" #include -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#include "./cudnn_rnn-inl.h" -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { -template -void RNNStatefulCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - int dtype = inputs[rnn_enum::kData].type_flag_; - #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - CuDNNRNNOp& op = state.get_state>(); - op.Forward(ctx, inputs, req, outputs); - }); - #else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; - #endif // MXNET_USE_CUDNN && CUDNN_MAJOR -} - -template -void RNNStatefulGradCompute(const OpStatePtr& state, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector out_data{inputs[3]}; - std::vector out_grad{inputs[4]}; - const std::vector &in_grad = outputs; - int dtype = inputs[rnn_enum::kData].type_flag_; - #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - CuDNNRNNOp& op = state.get_state>(); - const RNNParam& param = op.param_; - int index = 5; - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - out_grad.push_back(inputs[index++]); - } - - if (param.mode == rnn_enum::kLstm) { - in_data.push_back(inputs[index++]); - if (param.state_outputs) { - out_data.push_back(inputs[index++]); - 
out_grad.push_back(inputs[index]); - } - } - op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); - }); - #else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; - #endif // MXNET_USE_CUDNN && CUDNN_MAJOR -} - NNVM_REGISTER_OP(RNN) .set_attr("FStatefulCompute", RNNStatefulCompute); From 31ec7d0a64a7972545e1820928db34d7698083c8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 20 Mar 2019 23:19:15 +0800 Subject: [PATCH 05/25] fix some bugs --- src/operator/rnn-inl.h | 113 ++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 51 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index e0b5de7b18f0..e5386781498f 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -380,7 +380,7 @@ void RNNBackward(DType* ws, } #if MXNET_USE_CUDNN_RNN -template +template class RNNOp { public: RNNParam param_; @@ -505,6 +505,7 @@ class RNNOp { void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { +#if defined(__CUDACC__) using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; // kOut @@ -516,23 +517,23 @@ class RNNOp { CHECK_EQ(in_data.size(), num_inputs); CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); void * hy_ptr = NULL; if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm) - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; if (param_.mode == rnn_enum::kLstm && param_.state_outputs) - cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; + cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); @@ -544,8 +545,8 @@ class RNNOp { } // Get temp space int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -702,6 +703,7 @@ class RNNOp { workspace_byte_)); #endif } +#endif } void Backward(const OpContext &ctx, @@ -710,6 +712,7 @@ class RNNOp { const std::vector &out_data, const std::vector &req, const std::vector &in_grad) { +#if defined(__CUDACC__) using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 
4 : 3; // kOut @@ -726,23 +729,23 @@ class RNNOp { CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); if (req[rnn_enum::kParams] != kAddTo) { dw = mshadow::expr::ScalarExp(0.0f); } // only need kStateOut grad output_states is true void * dhy_ptr = NULL; if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; // Deal with lstm void * dcx_ptr = NULL; @@ -751,11 +754,11 @@ class RNNOp { if (param_.mode == rnn_enum::kLstm) { CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; } if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); @@ -771,8 +774,8 @@ class RNNOp { // Get temp space int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, @@ -863,13 +866,16 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif +#endif } private: - inline void Init(mshadow::Stream *s, + inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { +#if defined(__CUDACC__) using namespace mshadow; + #if CUDNN_MAJOR >= 5 format_ = CUDNN_TENSOR_NCHW; #endif @@ -887,8 +893,8 @@ class RNNOp { if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); param_.seq_length_ = x.shape_[0]; param_.batch_size_ = x.shape_[1]; param_.input_size_ = x.shape_[2]; @@ -1168,6 +1174,7 @@ class RNNOp { // } // } } +#endif } cudnnDataType_t dtype_; @@ -1199,7 +1206,7 @@ class RNNOp { #endif }; #else -template +template class RNNOp { public: explicit RNNOp(RNNParam p) @@ -1238,12 +1245,12 @@ class RNNOp { } CHECK_EQ(in_data.size(), num_inputs); CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); + Stream *s = 
ctx.get_stream(); // get input + output tensor - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); CHECK(x.CheckContiguous()); CHECK(w.CheckContiguous()); CHECK(hx.CheckContiguous()); @@ -1272,8 +1279,8 @@ class RNNOp { // allocate temp space const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size), s); if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -1356,16 +1363,16 @@ class RNNOp { CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - mshadow::Stream *s = ctx.get_stream(); + mshadow::Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); CHECK(x.CheckContiguous()); CHECK(w.CheckContiguous()); CHECK(hx.CheckContiguous()); @@ -1404,8 +1411,8 @@ class RNNOp { // allocate temp space const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size), s); size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -1462,7 +1469,11 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, const RNNParam& param = nnvm::get(attrs.parsed); OpStatePtr state = OpStatePtr(); MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { - state = OpStatePtr::Create>(param); + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(param); + } else { + state = OpStatePtr::Create>(param); + } return state; }); @@ -1477,7 +1488,7 @@ void RNNStatefulCompute(const OpStatePtr& state, const std::vector& outputs) { int dtype = inputs[rnn_enum::kData].type_flag_; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); + RNNOp& op = state.get_state>(); op.Forward(ctx, inputs, req, outputs); }); } @@ -1508,7 +1519,7 @@ void RNNStatefulGradCompute(const OpStatePtr& state, int dtype = 
inputs[rnn_enum::kData].type_flag_; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - RNNOp& op = state.get_state>(); + RNNOp& op = state.get_state>(); const RNNParam& param = op.param_; int index = 5; if (param.state_outputs) { From 27d11e7a3611ba2aa528d04d30bf90dae649a15c Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Thu, 21 Mar 2019 21:25:22 +0800 Subject: [PATCH 06/25] fix bug about gpu case like tests/python/gpu/test_gluon_gpu.test_rnn_layers_fp32 etc --- src/operator/rnn-inl.h | 499 +++++++++++++++++------------------------ 1 file changed, 206 insertions(+), 293 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index e5386781498f..30fe746ce528 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -379,13 +379,15 @@ void RNNBackward(DType* ws, } } -#if MXNET_USE_CUDNN_RNN template class RNNOp { public: RNNParam param_; - explicit RNNOp(RNNParam param) { + Context ctx_; + explicit RNNOp(RNNParam param, Context ctx) { this->param_ = param; + this->ctx_ = ctx; + #if MXNET_USE_CUDNN_RNN init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. @@ -463,9 +465,23 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif + #endif + if (ctx_.dev_type == kCPU) { + this->init_space_ = false; + this->reserve_space_size_ = 0; + if (param_.projection_size.has_value()) { + LOG(FATAL) << + "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; + } + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; + } + } } ~RNNOp() { + #if MXNET_USE_CUDNN_RNN CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); @@ -500,12 +516,22 @@ class RNNOp { CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); #endif + #endif + if (ctx_.dev_type == kCPU) { + if (init_space_) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + } } void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { -#if defined(__CUDACC__) + using namespace mshadow::expr; + CHECK(param_.p >= 0.0f && param_.p < 1.0f) + << "unsupported dropout value, should be 0 <= dropout < 1"; + using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; // kOut @@ -524,10 +550,17 @@ class RNNOp { Tensor hx = in_data[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); - void * hy_ptr = NULL; - if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + const int direction = param_.bidirectional ? 
2 : 1; + const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; + DType* hy_ptr = NULL; + if (param_.state_outputs) { + hy_ptr = out_data[rnn_enum::kStateOut].dptr(); + } DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm) @@ -540,6 +573,12 @@ class RNNOp { CHECK_EQ(hx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); + // allocate temp space + const size_t workspace_size_cpu = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + + DType* workspace_cpu = NULL; + #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -547,7 +586,10 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); + mshadow::Shape1(temp_size + workspace_size_cpu), s); + + workspace_cpu = temp_space.dptr_ + temp_size; + #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, @@ -703,7 +745,69 @@ class RNNOp { workspace_byte_)); #endif } -#endif + #endif + + if (ctx_.dev_type == kCPU) { + if (!workspace_cpu) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size_cpu), s); + workspace_cpu = workspace.dptr_; + } + if (ctx.is_train) { + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (init_space_ && reserve_space_size_ < r_size) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + if (!init_space_) { + reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); + reserve_space_size_ = r_size; + init_space_ = true; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + + RNNForwardTraining(workspace_cpu, + reserve_space_ptr, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.p, + param_.mode); + } else { + RNNForwardInference(workspace_cpu, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.mode); + } + } } void Backward(const OpContext &ctx, @@ -712,8 +816,11 @@ class RNNOp { const std::vector &out_data, const std::vector &req, const std::vector &in_grad) { -#if defined(__CUDACC__) using namespace mshadow; + using namespace mshadow::expr; + CHECK(param_.p >= 0.0f && param_.p < 1.0f) + << "unsupported dropout value, should be 0 <= dropout < 1"; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 
4 : 3; // kOut size_t num_outputs = 1; @@ -739,18 +846,36 @@ class RNNOp { Tensor dhx = in_grad[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor dy = out_grad[rnn_enum::kOut].get(s); + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(dw.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(dhx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(dy.CheckContiguous(), true); + if (req[rnn_enum::kParams] != kAddTo) { dw = mshadow::expr::ScalarExp(0.0f); } - // only need kStateOut grad output_states is true - void * dhy_ptr = NULL; - if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; - // Deal with lstm - void * dcx_ptr = NULL; - void * dcy_ptr = NULL; - void * cx_ptr = NULL; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + + const int direction = param_.bidirectional ? 2 : 1; + const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + + DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; + + DType * dhy_ptr = NULL; + if (param_.state_outputs) { + dhy_ptr = out_grad[rnn_enum::kStateOut].dptr(); + } + + DType* dcx_ptr = NULL; + DType* dcy_ptr = NULL; + DType* cx_ptr = NULL; if (param_.mode == rnn_enum::kLstm) { CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; @@ -760,14 +885,12 @@ class RNNOp { if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(dw.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(dhx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(dy.CheckContiguous(), true); - + // allocate temp space + const size_t workspace_size_cpu = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + DType* workspace_cpu = NULL; + #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -776,7 +899,8 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); + mshadow::Shape1(temp_size + workspace_size_cpu), s); + workspace_cpu = temp_space.dptr_ + temp_size; #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -866,14 +990,60 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif -#endif + #endif + + if (ctx_.dev_type == kCPU) { + if (!workspace_cpu) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(workspace_size_cpu), s); + workspace_cpu = workspace.dptr_; + } + size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + + if (!init_space_ || reserve_space_size_ != r_size) { + LOG(FATAL) << "Check forward init error"; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + RNNBackward(workspace_cpu, + reserve_space_ptr, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + y.dptr_, + dy.dptr_, + dhy_ptr, + dcy_ptr, + dx.dptr_, + dhx.dptr_, + dcx_ptr, + dw.dptr_, + db_ptr, + req[rnn_enum::kData], + 
req[rnn_enum::kParams], + req[rnn_enum::kState], + // State cell should be present for LSTMs, but is absent for other RNNs. + param_.mode == rnn_enum::kLstm ? req[rnn_enum::kStateCell] : kNullOp, + param_.p, + param_.mode); + } } + private: inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { -#if defined(__CUDACC__) + #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) using namespace mshadow; #if CUDNN_MAJOR >= 5 @@ -970,7 +1140,6 @@ class RNNOp { strideB[1] = dimB[2]; strideB[2] = 1; #endif - #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, @@ -1114,7 +1283,6 @@ class RNNOp { workspace_size_ = workspace_byte_ / sizeof(DType); // Allocate the reserve space reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); - // Check that number of params are correct size_t cudnn_param_size; CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, @@ -1123,7 +1291,6 @@ class RNNOp { &cudnn_param_size, dtype_)); CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); - // Set param descriptors int dim_w[3] = {1, 1, 1}; dim_w[0] = w.shape_[0]; @@ -1174,9 +1341,9 @@ class RNNOp { // } // } } -#endif + #endif } - + #if MXNET_USE_CUDNN_RNN cudnnDataType_t dtype_; bool init_cudnn_; cudnnRNNDescriptor_t rnn_desc_; @@ -1204,263 +1371,11 @@ class RNNOp { #if CUDNN_MAJOR >= 5 cudnnTensorFormat_t format_; #endif -}; -#else -template -class RNNOp { - public: - explicit RNNOp(RNNParam p) - :param_(p), init_space_(false), reserve_space_size_(0) { - if (param_.projection_size.has_value()) { - LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; - } - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; - } - } - - ~RNNOp() { - if (init_space_) { - Storage::Get()->Free(reserve_space_); - init_space_ = false; - } - } - - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK(param_.p >= 0.0f && param_.p < 1.0f) - << "unsupported dropout value, should be 0 <= dropout < 1"; - - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - } - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - Stream *s = ctx.get_stream(); - // get input + output tensor - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - CHECK(x.CheckContiguous()); - CHECK(w.CheckContiguous()); - CHECK(hx.CheckContiguous()); - CHECK(y.CheckContiguous()); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - const int direction = param_.bidirectional ? 
2 : 1; - const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); - DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; - - DType* hy_ptr = NULL; - if (param_.state_outputs) { - hy_ptr = out_data[rnn_enum::kStateOut].dptr(); - } - DType* cx_ptr = NULL; - DType* cy_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - cx_ptr = in_data[rnn_enum::kStateCell].dptr(); - if (param_.state_outputs) { - cy_ptr = out_data[rnn_enum::kStateCellOut].dptr(); - } - } - - // allocate temp space - const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); - if (ctx.is_train) { - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (init_space_ && reserve_space_size_ < r_size) { - Storage::Get()->Free(reserve_space_); - init_space_ = false; - } - if (!init_space_) { - reserve_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_space_size_ = r_size; - init_space_ = true; - } - - DType* reserve_space_ptr = static_cast(reserve_space_.dptr); - - RNNForwardTraining(workspace.dptr_, - reserve_space_ptr, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.p, - param_.mode); - } else { - RNNForwardInference(workspace.dptr_, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.mode); - } - } - - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK(param_.p >= 0.0f && param_.p < 1.0f) - << "unsupported dropout value, should be 0 <= dropout < 1"; - - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - // kOut - size_t num_outputs = 1; - if (param_.state_outputs) { - // kOut, kStateOut, kStateCellOut - num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; - } - - CHECK_EQ(in_data.size(), num_inputs); - CHECK_EQ(out_data.size(), num_outputs); - CHECK_EQ(in_grad.size(), num_inputs); - CHECK_EQ(out_grad.size(), num_outputs); - CHECK_EQ(req.size(), num_inputs); - CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; - CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - mshadow::Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); - CHECK(x.CheckContiguous()); - CHECK(w.CheckContiguous()); - CHECK(hx.CheckContiguous()); - CHECK(y.CheckContiguous()); - CHECK(dx.CheckContiguous()); - CHECK(dw.CheckContiguous()); - CHECK(dhx.CheckContiguous()); - CHECK(dy.CheckContiguous()); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - - const int direction = param_.bidirectional ? 2 : 1; - const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); - - DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; - - DType * dhy_ptr = NULL; - if (param_.state_outputs) { - dhy_ptr = out_grad[rnn_enum::kStateOut].dptr(); - } - - DType * cx_ptr = NULL; - DType * dcx_ptr = NULL; - DType * dcy_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = in_data[rnn_enum::kStateCell].dptr(); - dcx_ptr = in_grad[rnn_enum::kStateCell].dptr(); - if (param_.state_outputs) { - dcy_ptr = out_grad[rnn_enum::kStateCellOut].dptr(); - } - } - - // allocate temp space - const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); - - size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - - if (!init_space_ || reserve_space_size_ != r_size) { - LOG(FATAL) << "Check forward init error"; - } - - DType* reserve_space_ptr = static_cast(reserve_space_.dptr); - RNNBackward(workspace.dptr_, - reserve_space_ptr, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - y.dptr_, - dy.dptr_, - dhy_ptr, - dcy_ptr, - dx.dptr_, - dhx.dptr_, - dcx_ptr, - dw.dptr_, - db_ptr, - req[rnn_enum::kData], - req[rnn_enum::kParams], - req[rnn_enum::kState], - // State cell should be present for LSTMs, but is absent for other RNNs. - param_.mode == rnn_enum::kLstm ? 
req[rnn_enum::kStateCell] : kNullOp, - param_.p, - param_.mode); - } - - RNNParam param_; - - private: + #endif bool init_space_; size_t reserve_space_size_; - Storage::Handle reserve_space_; -}; // class RNNOp -#endif + Storage::Handle reserve_cpu_space_; +}; // class RNNOp static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, const Context ctx, @@ -1470,14 +1385,12 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, OpStatePtr state = OpStatePtr(); MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { if (ctx.dev_type == kGPU) { - state = OpStatePtr::Create>(param); + state = OpStatePtr::Create>(param, ctx); } else { - state = OpStatePtr::Create>(param); + state = OpStatePtr::Create>(param, ctx); } - return state; }); - - return OpStatePtr(); + return state; } template From a523b77e3232256604f5e6316e25731d8e4a1fd9 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 10:10:19 +0800 Subject: [PATCH 07/25] fix gpu compile issue of unix-gpu and windows-gpu --- src/operator/rnn-inl.h | 63 +++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 30fe746ce528..feffd8c26ca2 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -468,7 +468,7 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { this->init_space_ = false; - this->reserve_space_size_ = 0; + this->reserve_cpu_space_size_ = 0; if (param_.projection_size.has_value()) { LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; @@ -528,11 +528,10 @@ class RNNOp { void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - - using namespace mshadow; size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 
4 : 3; // kOut size_t num_outputs = 1; @@ -574,11 +573,11 @@ class RNNOp { CHECK_EQ(y.CheckContiguous(), true); // allocate temp space - const size_t workspace_size_cpu = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); + DType* work_cpu_space = NULL; - DType* workspace_cpu = NULL; - #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -586,9 +585,9 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + workspace_size_cpu), s); + mshadow::Shape1(temp_size + work_cpu_space_size), s); - workspace_cpu = temp_space.dptr_ + temp_size; + work_cpu_space = temp_space.dptr_ + temp_size; #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -748,28 +747,28 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!workspace_cpu) { + if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size_cpu), s); - workspace_cpu = workspace.dptr_; + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; } if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); - if (init_space_ && reserve_space_size_ < r_size) { + if (init_space_ && reserve_cpu_space_size_ < r_size) { Storage::Get()->Free(reserve_cpu_space_); init_space_ = false; } if (!init_space_) { reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_space_size_ = r_size; + reserve_cpu_space_size_ = r_size; init_space_ = true; } DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - RNNForwardTraining(workspace_cpu, + RNNForwardTraining(work_cpu_space, reserve_space_ptr, param_.state_outputs, param_.num_layers, @@ -789,7 +788,7 @@ class RNNOp { param_.p, param_.mode); } else { - RNNForwardInference(workspace_cpu, + RNNForwardInference(work_cpu_space, param_.state_outputs, param_.num_layers, direction, @@ -886,11 +885,11 @@ class RNNOp { dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; // allocate temp space - const size_t workspace_size_cpu = + const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - DType* workspace_cpu = NULL; - #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) + DType* work_cpu_space = NULL; + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(s, in_data, out_data); } @@ -899,8 +898,8 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + workspace_size_cpu), s); - workspace_cpu = temp_space.dptr_ + temp_size; + mshadow::Shape1(temp_size + work_cpu_space_size), s); + work_cpu_space = temp_space.dptr_ + temp_size; #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -993,21 +992,21 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!workspace_cpu) { + if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size_cpu), s); - workspace_cpu = 
workspace.dptr_; + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; } size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); - if (!init_space_ || reserve_space_size_ != r_size) { + if (!init_space_ || reserve_cpu_space_size_ != r_size) { LOG(FATAL) << "Check forward init error"; } DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - RNNBackward(workspace_cpu, + RNNBackward(work_cpu_space, reserve_space_ptr, param_.num_layers, direction, @@ -1043,13 +1042,7 @@ class RNNOp { inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { - #if MXNET_USE_CUDNN_RNN && (USE_CUDNN_LSTM_PROJ || defined(__CUDACC__)) using namespace mshadow; - - #if CUDNN_MAJOR >= 5 - format_ = CUDNN_TENSOR_NCHW; - #endif - size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; // kOut size_t num_outputs = 1; @@ -1060,6 +1053,12 @@ class RNNOp { CHECK_EQ(in_data.size(), num_inputs); CHECK_EQ(out_data.size(), num_outputs); + + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) + #if CUDNN_MAJOR >= 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors @@ -1373,7 +1372,7 @@ class RNNOp { #endif #endif bool init_space_; - size_t reserve_space_size_; + size_t reserve_cpu_space_size_; Storage::Handle reserve_cpu_space_; }; // class RNNOp From 148d110c7ac385c67197ee944711d4167c38c0f4 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 13:02:06 +0800 Subject: [PATCH 08/25] print log for test --- src/operator/rnn-inl.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index feffd8c26ca2..409e511f0434 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -576,6 +576,26 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); DType* work_cpu_space = NULL; + #if MXNET_USE_CUDNN_RNN + LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; + #else + LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; + #endif + #if defined(__CUDACC__) + LOG(INFO) << "defined(__CUDACC__):true"; + #else + LOG(INFO) << "defined(__CUDACC__):false"; + #endif + #if MXNET_USE_CUDNN == 1 + LOG(INFO) << "MXNET_USE_CUDNN == 1:true"; + #else + LOG(INFO) << "MXNET_USE_CUDNN == 1:false"; + #endif + #if CUDNN_VERSION >= 7200 + LOG(INFO) << "CUDNN_VERSION >= 7200:true"; + #else + LOG(INFO) << "CUDNN_VERSION >= 7200:false"; + #endif #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { From 7b97de4e4592138126207be4ee093bbb097e8bec Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 14:24:25 +0800 Subject: [PATCH 09/25] fix GPU NO CUDNN for unix-gpu case --- src/operator/rnn-inl.h | 163 +++++++++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 21 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 409e511f0434..60a8af3f4d20 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -466,6 +466,24 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif #endif + + #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + this->init_space_ = false; + this->reserve_cpu_space_size_ = 0; + if (param_.projection_size.has_value()) { + LOG(FATAL) << + "hidden layer projection is only supported for GPU with CuDNN later than 
7.1.1"; + } + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; + } + } + #endif + + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { this->init_space_ = false; this->reserve_cpu_space_size_ = 0; @@ -517,6 +535,18 @@ class RNNOp { CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); #endif #endif + + #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + if (init_space_) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + } + #endif + + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (init_space_) { Storage::Get()->Free(reserve_cpu_space_); @@ -576,27 +606,6 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); DType* work_cpu_space = NULL; - #if MXNET_USE_CUDNN_RNN - LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; - #else - LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; - #endif - #if defined(__CUDACC__) - LOG(INFO) << "defined(__CUDACC__):true"; - #else - LOG(INFO) << "defined(__CUDACC__):false"; - #endif - #if MXNET_USE_CUDNN == 1 - LOG(INFO) << "MXNET_USE_CUDNN == 1:true"; - #else - LOG(INFO) << "MXNET_USE_CUDNN == 1:false"; - #endif - #if CUDNN_VERSION >= 7200 - LOG(INFO) << "CUDNN_VERSION >= 7200:true"; - #else - LOG(INFO) << "CUDNN_VERSION >= 7200:false"; - #endif - #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(s, in_data, out_data); @@ -764,8 +773,72 @@ class RNNOp { workspace_byte_)); #endif } + #else + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + if (!work_cpu_space) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; + } + if (ctx.is_train) { + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (init_space_ && reserve_cpu_space_size_ < r_size) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + if (!init_space_) { + reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); + reserve_cpu_space_size_ = r_size; + init_space_ = true; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + + RNNForwardTraining(work_cpu_space, + reserve_space_ptr, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.p, + param_.mode); + } else { + RNNForwardInference(work_cpu_space, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.mode); + } + } #endif + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] @@ -1009,8 +1082,56 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif + + #else + // GPU NO CUDNN + if (ctx_.dev_type == kGPU) { + if (!work_cpu_space) { + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(work_cpu_space_size), s); + work_cpu_space = workspace.dptr_; + } + 
size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + + if (!init_space_ || reserve_cpu_space_size_ != r_size) { + LOG(FATAL) << "Check forward init error"; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + RNNBackward(work_cpu_space, + reserve_space_ptr, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + y.dptr_, + dy.dptr_, + dhy_ptr, + dcy_ptr, + dx.dptr_, + dhx.dptr_, + dcx_ptr, + dw.dptr_, + db_ptr, + req[rnn_enum::kData], + req[rnn_enum::kParams], + req[rnn_enum::kState], + // State cell should be present for LSTMs, but is absent for other RNNs. + param_.mode == rnn_enum::kLstm ? req[rnn_enum::kStateCell] : kNullOp, + param_.p, + param_.mode); + } #endif + // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] From 58c7f86ddb9ae99a58980e3b1ea4c85938d4e8d5 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 15:47:00 +0800 Subject: [PATCH 10/25] print log --- src/operator/rnn-inl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 60a8af3f4d20..9b4f0b15028c 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -467,9 +467,15 @@ class RNNOp { #endif #endif + #if MXNET_USE_CUDNN_RNN + LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; + #else + LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; + #endif #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) // GPU NO CUDNN if (ctx_.dev_type == kGPU) { + LOG(INFO) << "ctx_.dev_type == kGPU:true"; this->init_space_ = false; this->reserve_cpu_space_size_ = 0; if (param_.projection_size.has_value()) { @@ -777,6 +783,7 @@ class RNNOp { // GPU NO CUDNN if (ctx_.dev_type == kGPU) { if (!work_cpu_space) { + LOG(INFO) << "work_cpu_space is null, allocate"; Tensor workspace = ctx.requested[rnn_enum::kTempSpace] .get_space_typed(Shape1(work_cpu_space_size), s); work_cpu_space = workspace.dptr_; From 2edb811887e17401b72027a576e69f1b84517408 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 22:57:10 +0800 Subject: [PATCH 11/25] remove print log and make gpu case has same error message as master when USE_CUDA=1 USE_CUDNN=0 --- src/operator/rnn-inl.h | 147 +---------------------------------------- 1 file changed, 1 insertion(+), 146 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 9b4f0b15028c..56003d3fca08 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -465,31 +465,10 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif - #endif - - #if MXNET_USE_CUDNN_RNN - LOG(INFO) << "MXNET_USE_CUDNN_RNN:true"; #else - LOG(INFO) << "MXNET_USE_CUDNN_RNN:false"; - #endif - #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - LOG(INFO) << "ctx_.dev_type == kGPU:true"; - this->init_space_ = false; - this->reserve_cpu_space_size_ = 0; - if (param_.projection_size.has_value()) { - LOG(FATAL) << - "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; - } - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; - } - } + LOG(FATAL) << "RNN on 
GPU is only available for cuDNN at the moment."; #endif - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { this->init_space_ = false; this->reserve_cpu_space_size_ = 0; @@ -542,17 +521,6 @@ class RNNOp { #endif #endif - #if !MXNET_USE_CUDNN_RNN || !defined(__CUDACC__) - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - if (init_space_) { - Storage::Get()->Free(reserve_cpu_space_); - init_space_ = false; - } - } - #endif - - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (init_space_) { Storage::Get()->Free(reserve_cpu_space_); @@ -779,73 +747,8 @@ class RNNOp { workspace_byte_)); #endif } - #else - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - if (!work_cpu_space) { - LOG(INFO) << "work_cpu_space is null, allocate"; - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } - if (ctx.is_train) { - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (init_space_ && reserve_cpu_space_size_ < r_size) { - Storage::Get()->Free(reserve_cpu_space_); - init_space_ = false; - } - if (!init_space_) { - reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_cpu_space_size_ = r_size; - init_space_ = true; - } - - DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - - RNNForwardTraining(work_cpu_space, - reserve_space_ptr, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.p, - param_.mode); - } else { - RNNForwardInference(work_cpu_space, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.mode); - } - } #endif - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] @@ -1089,56 +992,8 @@ class RNNOp { reserve_space_.dptr, reserve_space_byte_)); #endif - - #else - // GPU NO CUDNN - if (ctx_.dev_type == kGPU) { - if (!work_cpu_space) { - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } - size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - - if (!init_space_ || reserve_cpu_space_size_ != r_size) { - LOG(FATAL) << "Check forward init error"; - } - - DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); - RNNBackward(work_cpu_space, - reserve_space_ptr, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - y.dptr_, - dy.dptr_, - dhy_ptr, - dcy_ptr, - dx.dptr_, - dhx.dptr_, - dcx_ptr, - dw.dptr_, - db_ptr, - req[rnn_enum::kData], - req[rnn_enum::kParams], - req[rnn_enum::kState], - // State cell should be present for LSTMs, but is absent for other RNNs. - param_.mode == rnn_enum::kLstm ? 
req[rnn_enum::kStateCell] : kNullOp, - param_.p, - param_.mode); - } #endif - // if dev_type is CPU, run CPU code if (ctx_.dev_type == kCPU) { if (!work_cpu_space) { Tensor workspace = ctx.requested[rnn_enum::kTempSpace] From b94c45215d2a366c867a3d205fa522c42c446ee5 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Fri, 22 Mar 2019 23:55:13 +0800 Subject: [PATCH 12/25] fix typo bug --- src/operator/rnn-inl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 56003d3fca08..db90fe8b74de 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -466,7 +466,9 @@ class RNNOp { CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); #endif #else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + if (ctx_.dev_type == kGPU) { + LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + } #endif if (ctx_.dev_type == kCPU) { From bb6e9f1d2f7ff341b2935a794122b4e6812e13b9 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 25 Mar 2019 08:57:26 +0800 Subject: [PATCH 13/25] retrigger the ci From adc92d354826dfdb9d1030461de5ccd5be72edec Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 25 Mar 2019 14:08:51 +0800 Subject: [PATCH 14/25] retrigger the ci From 67b3909f4044760fdeff3bd4af896c1d2da904a8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Tue, 26 Mar 2019 13:45:19 +0800 Subject: [PATCH 15/25] retrigger the ci From 3987b9c5a0abfc568e37683f5202956bf6eb56d3 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Tue, 26 Mar 2019 22:21:15 +0800 Subject: [PATCH 16/25] fix comments --- src/operator/rnn-inl.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index db90fe8b74de..30055ff16e9a 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -511,9 +511,6 @@ class RNNOp { init_cudnn_ = false; Storage::Get()->Free(reserve_space_); - if (param_.p > 0) { - Storage::Get()->Free(dropout_states_); - } } #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); @@ -584,7 +581,7 @@ class RNNOp { DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { - Init(s, in_data, out_data); + Init(ctx, s, in_data, out_data); } // Get temp space int temp_size = workspace_size_; @@ -896,7 +893,7 @@ class RNNOp { DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { - Init(s, in_data, out_data); + Init(ctx, s, in_data, out_data); } // Get temp space @@ -1044,7 +1041,8 @@ class RNNOp { private: - inline void Init(mshadow::Stream *s, + inline void Init(const OpContext &ctx, + mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { using namespace mshadow; @@ -1218,18 +1216,21 @@ class RNNOp { strideA)); // Create Dropout descriptors + DType* dropout_states_ = NULL; if (param_.p > 0) { CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); + dropout_states_ = ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(dropout_size_), s).dptr_; } else { - dropout_states_ = {}; dropout_byte_ = 0; } + CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, param_.p, // discard probability - dropout_states_.dptr, dropout_byte_, + dropout_states_, dropout_byte_, seed_)); + // RNN descriptors #if CUDNN_MAJOR >= 6 
cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; @@ -1355,7 +1356,7 @@ class RNNOp { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle dropout_states_, reserve_space_; + Storage::Handle reserve_space_; uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) size_t workspace_byte_, reserve_space_byte_, dropout_byte_; int workspace_size_, dropout_size_; From d6b3fe5ccea013056cc5a3f2e4a3ff78f0974d74 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 27 Mar 2019 08:50:39 +0800 Subject: [PATCH 17/25] retrigger the ci From 9b7c3fcca3909169827e08e54cfab4e4f3a1ea55 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 10:10:58 +0800 Subject: [PATCH 18/25] fix comments --- src/operator/rnn-inl.h | 44 ++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 30055ff16e9a..c1fd005ac90d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -563,8 +563,8 @@ class RNNOp { if (param_.state_outputs) { hy_ptr = out_data[rnn_enum::kStateOut].dptr(); } - DType * cx_ptr = NULL; - DType * cy_ptr = NULL; + DType* cx_ptr = NULL; + DType* cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm) cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; if (param_.mode == rnn_enum::kLstm && param_.state_outputs) @@ -575,10 +575,6 @@ class RNNOp { CHECK_EQ(hx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); - // allocate temp space - const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) if (!init_cudnn_) { Init(ctx, s, in_data, out_data); @@ -587,9 +583,7 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + work_cpu_space_size), s); - - work_cpu_space = temp_space.dptr_ + temp_size; + mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -749,11 +743,13 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!work_cpu_space) { - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } + // allocate temp space + const size_t work_cpu_space_size = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + .get_space_typed(Shape1(work_cpu_space_size), s); + DType* work_cpu_space = workspace.dptr_; if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -855,6 +851,7 @@ class RNNOp { CHECK_EQ(dhx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); CHECK_EQ(dy.CheckContiguous(), true); + CHECK_EQ(dx.CheckContiguous(), true); if (req[rnn_enum::kParams] != kAddTo) { dw = mshadow::expr::ScalarExp(0.0f); @@ -886,11 +883,6 @@ class RNNOp { if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - // allocate temp space - const size_t work_cpu_space_size = - GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - DType* work_cpu_space = NULL; #if MXNET_USE_CUDNN_RNN && 
defined(__CUDACC__) if (!init_cudnn_) { Init(ctx, s, in_data, out_data); @@ -900,8 +892,7 @@ class RNNOp { int temp_size = workspace_size_; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size + work_cpu_space_size), s); - work_cpu_space = temp_space.dptr_ + temp_size; + mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -994,11 +985,14 @@ class RNNOp { #endif if (ctx_.dev_type == kCPU) { - if (!work_cpu_space) { - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] + // allocate temp space + const size_t work_cpu_space_size = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + DType* work_cpu_space = NULL; + Tensor workspace = ctx.requested[rnn_enum::kTempSpace] .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; - } + work_cpu_space = workspace.dptr_; size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); From 27a6840cd159288dfdb211a6b5fce5314b2a4e69 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 12:09:07 +0800 Subject: [PATCH 19/25] retrigger the ci From 0aec50e9127ebe44d0ba5f774e26a8f0ccbeb2c8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 15:40:03 +0800 Subject: [PATCH 20/25] fix comments --- src/operator/rnn-inl.h | 10 ++++------ src/operator/rnn.cc | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index c1fd005ac90d..f3d7a4a4a4ff 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -51,7 +51,7 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kState, kStateCell}; enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; - enum RNNOpResource {kTempSpace}; + enum RNNOpResource {kTempSpace, kCuDNNDropoutDescSpace}; } inline int GetRnnParamSize(int num_layer, @@ -1212,10 +1212,8 @@ class RNNOp { // Create Dropout descriptors DType* dropout_states_ = NULL; if (param_.p > 0) { - CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); - dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(dropout_size_), s).dptr_; + ctx.requested[rnn_enum::kCuDNNDropoutDescSpace].get_cudnn_dropout_desc + (&dropout_desc_, s, 1.0f - param_.p, seed_); } else { dropout_byte_ = 0; } @@ -1353,7 +1351,7 @@ class RNNOp { Storage::Handle reserve_space_; uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) size_t workspace_byte_, reserve_space_byte_, dropout_byte_; - int workspace_size_, dropout_size_; + int workspace_size_; std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; #if USE_CUDNN_LSTM_PROJ cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index aa5c3f305975..70ee3cd216e0 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -243,8 +243,21 @@ The definition of GRU here is slightly different from paper but compatible with .set_attr("FCreateOpState", CreateRNNState) .set_attr("FStatefulCompute", RNNStatefulCompute) .set_attr("FGradient", RNNGrad{"_backward_RNN"}) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; +.set_attr("FResourceRequestEx", + 
[](const NodeAttrs& attrs, const int dev_mask, const DispatchMode dispatch_mode) { + std::vector request; + request.emplace_back(ResourceRequest::kTempSpace); + const RNNParam& param = nnvm::get(attrs.parsed); + if (param.p == 0) return request; + if (dev_mask == kGPU) { +#if MXNET_USE_CUDNN_RNN + if (1.0f - param.p > 0) { + request.emplace_back(ResourceRequest::kCuDNNDropoutDesc); + return request; + } +#endif + } + return request; }) .add_argument("data", "NDArray-or-Symbol", "Input data to RNN") .add_argument("parameters", "NDArray-or-Symbol", From 69f7f7852944ae181c450e7bd30e30aa6f21bf00 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Mon, 8 Apr 2019 20:06:07 +0800 Subject: [PATCH 21/25] retrigger the ci From c950d1f4b7e864fc744e43bdb8b8d859737568f3 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Tue, 9 Apr 2019 06:41:14 +0800 Subject: [PATCH 22/25] retrigger the ci From 5c5a9a446dd8f3110df8e5bf5587cda425eda3b4 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 10 Apr 2019 10:54:12 +0800 Subject: [PATCH 23/25] retrigger the ci From d99d82aa863877958502e391f95e80cfe0043bcc Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Wed, 10 Apr 2019 20:01:48 +0800 Subject: [PATCH 24/25] retrigger the ci From 69295caad333a6e79d84a48410a0893c10786ab8 Mon Sep 17 00:00:00 2001 From: "Li, Hao H" Date: Thu, 11 Apr 2019 12:26:26 +0800 Subject: [PATCH 25/25] fix comments --- src/operator/rnn-inl.h | 69 +++++++++++++++++++++++------------------- src/operator/rnn.cc | 4 --- 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index f3d7a4a4a4ff..37f21ce6d126 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -51,7 +51,7 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kState, kStateCell}; enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; - enum RNNOpResource {kTempSpace, kCuDNNDropoutDescSpace}; + enum RNNOpResource {kCuDNNDropoutDescSpace}; } inline int GetRnnParamSize(int num_layer, @@ -473,7 +473,10 @@ class RNNOp { if (ctx_.dev_type == kCPU) { this->init_space_ = false; + this->temp_init_space_ = false; this->reserve_cpu_space_size_ = 0; + this->temp_cpu_space_size_ = 0; + if (param_.projection_size.has_value()) { LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; @@ -509,7 +512,7 @@ class RNNOp { CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); } init_cudnn_ = false; - + Storage::Get()->Free(temp_space_); Storage::Get()->Free(reserve_space_); } #if USE_CUDNN_LSTM_PROJ @@ -525,6 +528,10 @@ class RNNOp { Storage::Get()->Free(reserve_cpu_space_); init_space_ = false; } + if (temp_init_space_) { + Storage::Get()->Free(temp_cpu_space_); + temp_init_space_ = false; + } } } @@ -579,11 +586,6 @@ class RNNOp { if (!init_cudnn_) { Init(ctx, s, in_data, out_data); } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); @@ -663,7 +665,7 @@ class RNNOp { nullptr, nullptr, nullptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -685,7 +687,7 @@ class RNNOp { hy_ptr, cy_desc_, cy_ptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -716,7 +718,7 @@ class RNNOp { nullptr, nullptr, nullptr, - temp_space.dptr_, + 
temp_space_.dptr, workspace_byte_)); #else CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, @@ -736,7 +738,7 @@ class RNNOp { hy_ptr, cy_desc_, cy_ptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_)); #endif } @@ -747,9 +749,17 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - DType* work_cpu_space = workspace.dptr_; + if (temp_init_space_ && temp_cpu_space_size_ < work_cpu_space_size) { + Storage::Get()->Free(temp_cpu_space_); + temp_init_space_ = false; + } + if (!temp_init_space_) { + temp_cpu_space_ = Storage::Get()->Alloc + (work_cpu_space_size * sizeof(DType), Context::CPU()); + temp_cpu_space_size_ = work_cpu_space_size; + temp_init_space_ = true; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.dptr); if (ctx.is_train) { const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -888,11 +898,6 @@ class RNNOp { Init(ctx, s, in_data, out_data); } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); #if USE_CUDNN_LSTM_PROJ CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, rnn_desc_, @@ -920,7 +925,7 @@ class RNNOp { dcx_ptr, nullptr, nullptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -932,7 +937,7 @@ class RNNOp { hx.dptr_, y_data_desc_, y.dptr_, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, dw_desc_, dw.dptr_, @@ -962,7 +967,7 @@ class RNNOp { dhx.dptr_, dcx_desc_, dcx_ptr, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, reserve_space_.dptr, reserve_space_byte_)); @@ -975,7 +980,7 @@ class RNNOp { hx.dptr_, y_desc_vec_.data(), y.dptr_, - temp_space.dptr_, + temp_space_.dptr, workspace_byte_, dw_desc_, dw.dptr_, @@ -989,10 +994,10 @@ class RNNOp { const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); - DType* work_cpu_space = NULL; - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(work_cpu_space_size), s); - work_cpu_space = workspace.dptr_; + if (!temp_init_space_ || temp_cpu_space_size_ != work_cpu_space_size) { + LOG(FATAL) << "Check temp init error"; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.dptr); size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, param_.state_size, param_.mode); @@ -1280,6 +1285,8 @@ class RNNOp { workspace_size_ = workspace_byte_ / sizeof(DType); // Allocate the reserve space reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); + // Allocate the temp space + temp_space_ = Storage::Get()->Alloc(workspace_byte_, Context::GPU(s->dev_id)); // Check that number of params are correct size_t cudnn_param_size; CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, @@ -1348,7 +1355,7 @@ class RNNOp { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle reserve_space_; + Storage::Handle reserve_space_, temp_space_; uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) size_t workspace_byte_, reserve_space_byte_, dropout_byte_; int workspace_size_; @@ -1369,9 +1376,9 @@ class RNNOp { 
cudnnTensorFormat_t format_; #endif #endif - bool init_space_; - size_t reserve_cpu_space_size_; - Storage::Handle reserve_cpu_space_; + bool init_space_, temp_init_space_; + size_t reserve_cpu_space_size_, temp_cpu_space_size_; + Storage::Handle reserve_cpu_space_, temp_cpu_space_; }; // class RNNOp static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 70ee3cd216e0..74c563afceb1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -246,7 +246,6 @@ The definition of GRU here is slightly different from paper but compatible with .set_attr("FResourceRequestEx", [](const NodeAttrs& attrs, const int dev_mask, const DispatchMode dispatch_mode) { std::vector request; - request.emplace_back(ResourceRequest::kTempSpace); const RNNParam& param = nnvm::get(attrs.parsed); if (param.p == 0) return request; if (dev_mask == kGPU) { @@ -275,9 +274,6 @@ NNVM_REGISTER_OP(_backward_RNN) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) .set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet
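
For readers following the series, the end state is a stateful operator: RNN is registered with FCreateOpState plus FStatefulCompute, and the op state object (RNNOp / CuDNNRNNOp) keeps the expensive, reusable pieces — cuDNN descriptors, reserve space, temp space — alive across forward/backward calls instead of rebuilding them per invocation. The standalone C++ sketch below illustrates only that pattern under stated assumptions; it does not include MXNet headers, and every name in it (RNNParamSketch, StatefulRNNSketch, CreateState) is an illustrative stand-in, not a symbol defined by these patches.

// Minimal sketch of the stateful-operator idea used above: state is created
// once, owns the buffers that must survive between forward and backward,
// and is reused by every subsequent compute call.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct RNNParamSketch {
  int mode;            // 0: rnn_relu, 1: rnn_tanh, 2: lstm, 3: gru
  float p;             // dropout probability, expected 0 <= p < 1
  bool state_outputs;  // whether hy (and cy for LSTM) are emitted
};

// Plays the role of RNNOp<DType>: the reserve buffer written by Forward is
// kept in the state and read back by Backward, with no per-call re-init.
class StatefulRNNSketch {
 public:
  explicit StatefulRNNSketch(const RNNParamSketch& param) : param_(param) {}

  void Forward(const std::vector<float>& x, std::vector<float>* y) {
    // Lazily (re)size the reserve space, mirroring the
    // init_space_/reserve_cpu_space_size_ bookkeeping in the patches.
    if (reserve_.size() < x.size()) reserve_.resize(x.size());
    y->assign(x.begin(), x.end());                     // placeholder "compute"
    std::copy(x.begin(), x.end(), reserve_.begin());   // stash for backward
  }

  void Backward(const std::vector<float>& dy, std::vector<float>* dx) {
    // Consumes the reserve space produced by Forward; nothing is reallocated.
    dx->resize(dy.size());
    for (std::size_t i = 0; i < dy.size(); ++i) dx->at(i) = dy[i] + reserve_[i];
  }

 private:
  RNNParamSketch param_;
  std::vector<float> reserve_;  // analogue of reserve_space_ / reserve_cpu_space_
};

// Plays the role of CreateRNNState: one state object per operator instance.
std::shared_ptr<StatefulRNNSketch> CreateState(const RNNParamSketch& param) {
  return std::make_shared<StatefulRNNSketch>(param);
}

int main() {
  RNNParamSketch param{2 /* lstm */, 0.5f, true};
  auto state = CreateState(param);   // analogue of FCreateOpState
  std::vector<float> x{1.f, 2.f, 3.f}, y, dy{0.1f, 0.1f, 0.1f}, dx;
  state->Forward(x, &y);             // analogue of FStatefulCompute (forward)
  state->Backward(dy, &dx);          // analogue of FStatefulCompute (backward)
  std::cout << dx[0] << " " << dx[1] << " " << dx[2] << std::endl;
  return 0;
}

The resource-request change in the last patch follows the same direction: the FResourceRequestEx lambda asks for the cuDNN dropout-descriptor resource only when dropout is enabled (param.p > 0) and the op is dispatched to GPU with cuDNN, and the kTempSpace request is dropped from both RNN and _backward_RNN because workspace and reserve buffers are now allocated once inside the op state (temp_space_/reserve_space_ on GPU, temp_cpu_space_/reserve_cpu_space_ on CPU) and released in the destructor.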