diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 557c1117739a..d41b5b4f030b 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -63,7 +63,7 @@ struct RNNParam : public dmlc::Parameter { bool bidirectional, state_outputs; int mode; float p; - int seq_length_, batch_size_, input_size_; + index_t seq_length_, batch_size_, input_size_; bool use_sequence_length; dmlc::optional projection_size; @@ -122,8 +122,8 @@ struct RNNParam : public dmlc::Parameter { } }; -inline int GetRnnParamSize(int num_layer, - int input_size, +inline index_t GetRnnParamSize(int num_layer, + index_t input_size, int state_size, int direction, int mode, @@ -140,14 +140,14 @@ inline int GetRnnParamSize(int num_layer, size *= 3; break; } - int size1 = (input_size + state_size + 2) * size; // first layer size - int size2 = (state_size * direction + state_size + 2) * size; // other layers size + index_t size1 = (input_size + state_size + 2) * size; // first layer size + index_t size2 = (state_size * direction + state_size + 2) * size; // other layers size if (projection_size.has_value()) { - int proj_size = projection_size.value(); + index_t proj_size = projection_size.value(); size1 = (input_size + proj_size + 2) * size; size2 = (proj_size * direction + proj_size + 2) * size; } - int param_size = size1 + (num_layer - 1) * size2; + index_t param_size = size1 + (num_layer - 1) * size2; if (projection_size.has_value()) { param_size += projection_size.value() * state_size * num_layer * direction; } @@ -182,8 +182,8 @@ inline int GetRnnBiasSize(int num_layer, * - output -> h[t](, c[t] additionally with Lstm) time by time(sz: NxH(x2)) * - intermediate y[1...T] as next layer's inputs(sz: TxNxHxD) */ -inline size_t GetRNNWorkspaceSize(int seq_length, - int batch_size, +inline size_t GetRNNWorkspaceSize(index_t seq_length, + index_t batch_size, int hidden_size, int projection_size, int direction, @@ -214,8 +214,8 @@ inline size_t GetRNNWorkspaceSize(int seq_length, inline size_t GetRNNReserveSpaceSize(int num_layer, int direction, - int seq_length, - int batch_size, + index_t seq_length, + index_t batch_size, int hidden_size, int mode) { size_t size = 0; @@ -279,9 +279,9 @@ void RNNForwardTraining(DType* ws, bool state_outputs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, @@ -321,9 +321,9 @@ void RNNForwardInference(DType* ws, bool state_outputs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, const int projection_size, DType* x_ptr, @@ -363,9 +363,9 @@ void RNNBackward(DType* ws, DType* rs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 9f9cf875e0ac..5acf4eb7b3bd 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -59,9 +59,9 @@ void LstmForwardTrainingSingleLayer(DType* ws, DType* rs, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor 
&x, const Tensor &hx, @@ -88,17 +88,17 @@ void LstmForwardTrainingSingleLayer(DType* ws, const int offset = bid ? H : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); DType ft = sigmoid(yx[t][j][1][k] + yh[j][1][k] + bx[1][k] + bh[1][k]); DType gt = tanh(yx[t][j][2][k] + yh[j][2][k] + bx[2][k] + bh[2][k]); @@ -127,9 +127,9 @@ void LstmForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -145,16 +145,16 @@ void LstmForwardTraining(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t cell_size = N * H; unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) int idx = 0; // state & cell state's idx; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < L; ++i) { - const int input_size = i ? H * D : I; - const int w_size = (input_size + H) * H * 4; + const index_t input_size = i ? H * D : I; + const index_t w_size = (input_size + H) * H * 4; Tensor x(x_ptr, Shape2(T * N, input_size)); Tensor y(rs2 + y_offset, Shape3(T, N, H * D)); LstmForwardTrainingSingleLayer(ws, rs2, state_outputs, false, T, N, input_size, H, x, @@ -175,7 +175,7 @@ void LstmForwardTraining(DType* ws, b_ptr += b_size; if (dropout > 0.0f) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * H * D; j++) { + for (index_t j = 0; j < T * N * H * D; j++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[i * T * N * H * D + j] = 0; @@ -196,7 +196,7 @@ void LstmForwardTraining(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = (rs2 + y_offset)[i]; } } @@ -205,9 +205,9 @@ template void LstmForwardInferenceSingleLayer(DType* ws, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, const Tensor &x, @@ -237,19 +237,19 @@ void LstmForwardInferenceSingleLayer(DType* ws, const int proj_offset = bid ? 
P : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; if (P > 0) { linalg_gemm(i ? r : hx, wh, yh_flat, alpha, beta, false, true); } else { linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); } #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { + for (index_t jk = 0; jk < cell_size; ++jk) { int j = jk / H; int k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); @@ -287,9 +287,9 @@ void LstmForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, DType* x_ptr, @@ -303,16 +303,16 @@ void LstmForwardInference(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, P ? P : H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int cell_size = N * H; - const int projection_size = (P ? P : H) * N; + const index_t b_size = 2 * H * 4; + const index_t cell_size = N * H; + const index_t projection_size = (P ? P : H) * N; DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2; DType* y_cur_ptr = y_ptr; int idx = 0; // state & cell state's idx; bool flag = L % 2 ? false : true; for (int i = 0; i < L; ++i) { - const int input_size = i ? (P ? P : H) * D : I; - int w_size = (input_size + (P ? P : H)) * H * 4; + const index_t input_size = i ? (P ? P : H) * D : I; + index_t w_size = (input_size + (P ? P : H)) * H * 4; if (P > 0) { w_size += P * H; } @@ -356,9 +356,9 @@ void LstmBackwardSingleLayer(DType* ws, DType* rs, DType* tmp_buf, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -408,41 +408,41 @@ void LstmBackwardSingleLayer(DType* ws, const DType beta0 = 0.0; const DType beta1 = 1.0; const DType beta2 = 2.0; - const int cell_size = N * H; + const index_t cell_size = N * H; if (dhy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = dhy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = 0; } } if (dcy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = dcy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = 0; } } - for (int i = T - 1; i >= 0; --i) { - int t = bid ? T - 1 - i : i; - int tnext = bid ? t + 1 : t - 1; + for (index_t i = T - 1; i >= 0; --i) { + index_t t = bid ? T - 1 - i : i; + index_t tnext = bid ? t + 1 : t - 1; const Tensor& dhnext = i ? dh : dhx; const Tensor& dcnext = i ? dc : dcx; const Tensor& hnext = i ? htmp : hx; const Tensor& cnext = i ? 
c[i - 1] : cx; #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType tc = tanh(c[i][j][k]); DType it = ifgo[i][j][k][0]; DType ft = ifgo[i][j][k][1]; @@ -485,13 +485,13 @@ void LstmBackwardSingleLayer(DType* ws, if (req_params != kNullOp && req_params != kAddTo) { linalg_gemm(dyx, x, dwx, alpha, beta0, true, false); } - const int row = T * N; - const int col = H * 4; + const index_t row = T * N; + const index_t col = H * 4; if (req_params != kNullOp) { if (req_params != kAddTo) { - for (int i = 0; i < row; ++i) { + for (index_t i = 0; i < row; ++i) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += dyx[i][j]; dbh[j] = dbx[j]; } @@ -500,20 +500,20 @@ void LstmBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf, Shape2(col, T)); const Tensor tmp_dbh(tmp_buf + col * T, Shape2(col, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < col * T; ++i) { + for (index_t i = 0; i < col * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { - for (int i = 0; i < N; ++i) { + for (index_t j = 0; j < col; ++j) { + for (index_t i = 0; i < N; ++i) { tmp_dbx[j][t] += dyx[t * N + i][j]; tmp_dbh[j][t] = tmp_dbx[j][t]; } } #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += tmp_dbx[j][t] + dbx[j]; dbh[j] += tmp_dbh[j][t] + dbh[j]; } @@ -527,9 +527,9 @@ void LstmBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -558,16 +558,16 @@ void LstmBackward(DType* ws, Tensor cx(cx_ptr, Shape3(total_layers, N, H)); Tensor dhx(dhx_ptr, Shape3(total_layers, N, H)); Tensor dcx(dcx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int w_size1 = (I + H) * H * 4; // first layer - const int w_size2 = (D * H + H) * H * 4; // other layers - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t w_size1 = (I + H) * H * 4; // first layer + const index_t w_size2 = (D * H + H) * H * 4; // other layers + const index_t cell_size = N * H; DType* dy_tmp_ptr = ws2 + T * cell_size * 4 + cell_size * 3; for (int i = L - 1; i >= 0; --i) { - const int input_size = i ? H * D : I; - const int w_size = i ? w_size2 : w_size1; + const index_t input_size = i ? H * D : I; + const index_t w_size = i ? w_size2 : w_size1; int idx = i * D; DType* w_cur_ptr = i ? w_ptr + (w_size1 + (i - 1) * w_size2) * D : w_ptr; DType* dw_cur_ptr = i ? 
dw_ptr + (w_size1 + (i - 1) * w_size2) * D : dw_ptr; @@ -599,7 +599,7 @@ void LstmBackward(DType* ws, dropout_random = dropout_random - T * N * D * H; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * D * H; j++) { + for (index_t j = 0; j < T * N * D * H; j++) { if (dropout_random[j] == 0) { dx.dptr_[j] = 0; } else { @@ -616,9 +616,9 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -655,13 +655,13 @@ void GruForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -679,7 +679,7 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -693,11 +693,11 @@ void GruForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + gemmC2[ztb + j] @@ -720,11 +720,11 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + @@ -744,7 +744,7 @@ void GruForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -752,7 +752,7 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -766,9 +766,9 @@ void GruForwardInference(DType* ws, bool 
state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -819,9 +819,9 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -867,13 +867,13 @@ void GruForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -892,7 +892,7 @@ void GruForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -910,11 +910,11 @@ void GruForwardTrainingSingleLayer(DType* ws, gemmC1_t = gemmC1 + t * N * 3 * H; DType* Mnht = Mnh + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; Mnht[i * H + j] = gemmC2[ntb + j] + bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); @@ -942,11 +942,11 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; back_Mnht[i * H + j] = gemmC2[ntb + j] + back_bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); @@ -968,7 +968,7 @@ void GruForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -976,7 +976,7 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -991,9 +991,9 @@ void GruForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1030,7 +1030,7 @@ void GruForwardTraining(DType* ws, if (dropout > 0.0f && l > 0) { const 
int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[(l - 1) * T * N * I + i] = 0; @@ -1062,7 +1062,7 @@ void GruForwardTraining(DType* ws, } const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1071,9 +1071,9 @@ template void GruBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1139,7 +1139,7 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -1148,7 +1148,7 @@ void GruBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -1156,7 +1156,7 @@ void GruBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -1164,13 +1164,13 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -1180,7 +1180,7 @@ void GruBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -1193,7 +1193,7 @@ void GruBackwardSingleLayer(DType* ws, dat = da + t * N * 3 * H; dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { int nid = i * 3 * H + 2 * H + j; int zid = i * 3 * H + H + j; @@ -1239,7 +1239,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += da[j * 3 * H + i]; dbh[i] += dar[j * 3 * H + i]; } @@ -1248,15 +1248,15 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for 
(index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1286,7 +1286,7 @@ void GruBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -1296,7 +1296,7 @@ void GruBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -1310,12 +1310,12 @@ void GruBackwardSingleLayer(DType* ws, dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int nid = i * 3 * H + 2 * H + j; - int zid = i * 3 * H + H + j; - int rid = i * 3 * H + j; - int id = i * H + j; + index_t nid = i * 3 * H + 2 * H + j; + index_t zid = i * 3 * H + H + j; + index_t rid = i * 3 * H + j; + index_t id = i * H + j; dat[nid] = back_dht1[id] * (1 - zt[id]) * (1 - nt[id] * nt[id]); dart[zid] = dat[zid] = back_dht1[id] * (back_ht1[i * D * H + H + j] - nt[id]) * zt[id] * (1 - zt[id]); @@ -1357,7 +1357,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += da[j * 3 * H + i]; back_dbh[i] += dar[j * 3 * H + i]; } @@ -1366,14 +1366,14 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1404,7 +1404,7 @@ void GruBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -1415,9 +1415,9 @@ void GruBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1469,7 +1469,7 @@ void GruBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -1488,7 +1488,7 @@ void GruBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -1498,7 +1498,7 @@ void 
GruBackward(DType* ws, } if (l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateR_l = gateR_l - T * D * N * H; @@ -1532,9 +1532,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1569,13 +1569,13 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1593,7 +1593,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1607,9 +1607,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1631,9 +1631,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1652,7 +1652,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1660,7 +1660,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1674,9 +1674,9 @@ void VanillaRNNForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1729,9 +1729,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1770,13 +1770,13 @@ void 
VanillaRNNForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1795,7 +1795,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1810,9 +1810,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, nt = gateN + t * N * H; gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1834,9 +1834,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N)); linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1856,7 +1856,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1864,7 +1864,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1879,9 +1879,9 @@ void VanillaRNNForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1916,7 +1916,7 @@ void VanillaRNNForwardTraining(DType* ws, } if (dropout > 0.0f && l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[(l - 1) * T * N * I + i] = 0; @@ -1944,7 +1944,7 @@ void VanillaRNNForwardTraining(DType* ws, wh_l = wx_l + I * H; } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1953,9 +1953,9 @@ template void VanillaRNNBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const 
int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -2013,7 +2013,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -2022,7 +2022,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -2030,7 +2030,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -2038,13 +2038,13 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -2054,7 +2054,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -2063,9 +2063,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, nt = gateN + t * N * H; dart = dar + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2104,7 +2104,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += dar[j * H + i]; dbh[i] = dbx[i]; } @@ -2113,15 +2113,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2151,7 +2151,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -2161,7 +2161,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -2171,9 +2171,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dart = dar + t * 
N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = back_dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2213,7 +2213,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += dar[j * H + i]; back_dbh[i] = back_dbx[i]; } @@ -2222,15 +2222,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2261,7 +2261,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -2272,9 +2272,9 @@ void VanillaRNNBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -2324,7 +2324,7 @@ void VanillaRNNBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -2343,7 +2343,7 @@ void VanillaRNNBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -2353,7 +2353,7 @@ void VanillaRNNBackward(DType* ws, } if (l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateN_l = gateN_l - T * D * N * H; diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index cc1977acdd91..5fb0ff81da6b 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -39,6 +39,7 @@ SMALL_Y = 50 LARGE_SIZE = LARGE_X * SMALL_Y LARGE_TENSOR_SHAPE = 2**32 +RNN_LARGE_TENSOR = 2**28 def test_nn(): @@ -479,7 +480,6 @@ def check_embedding(): assert out.shape[0] == LARGE_TENSOR_SHAPE assert out.shape[1] == 1 - assert out.shape[2] == 1 def check_spatial_transformer(): data = nd.random_normal(shape=(2, 2**29, 1, 6)) @@ -530,6 +530,39 @@ def check_multi_lars(): # Trigger lazy evaluation of the output NDArray and ensure that it has been filled assert type(out[0, 0].asscalar()).__name__ == 'float32' + + def check_rnn(): + data = nd.random_normal(shape=(RNN_LARGE_TENSOR, 4, 4)) + 
parameters_relu_tanh = nd.random_normal(shape=(7,)) + parameters_lstm = nd.random_normal(shape=(28,)) + parameters_gru = nd.random_normal(shape=(21,)) + state = nd.random_normal(shape=(1, 4, 1)) + state_cell = nd.random_normal(shape=(1, 4, 1)) + mode_relu = 'rnn_relu' + mode_tanh = 'rnn_tanh' + mode_lstm = 'lstm' + mode_gru = 'gru' + state_size = 1 + num_layers = 1 + + out_relu = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_relu, + state_size=state_size, num_layers=num_layers) + + out_tanh = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_tanh, + state_size=state_size, num_layers=num_layers) + + out_lstm = nd.RNN(data=data, parameters=parameters_lstm, state=state, mode=mode_lstm, + state_cell=state_cell, state_size=state_size, num_layers=num_layers) + + out_gru = nd.RNN(data=data, parameters=parameters_gru, state=state, mode=mode_gru, + state_size=state_size, num_layers=num_layers) + + for out in [out_relu, out_tanh, out_lstm, out_gru]: + assert out.shape[0] == RNN_LARGE_TENSOR + assert out.shape[1] == 4 + assert out.shape[2] == 1 + + assert type(out[0, 0, 0].asscalar()).__name__ == 'float32' check_gluon_embedding() check_fully_connected() @@ -556,6 +589,7 @@ def check_multi_lars(): check_ravel() check_cumsum() check_multi_lars() + check_rnn() def test_tensor():
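
Note (not part of the patch): the short Python sketch below mirrors the parameter-size arithmetic from GetRnnParamSize in rnn-inl.h (without the projection_size branch) and checks it against the shapes used by check_rnn(); the helper name rnn_param_size and the gate-count table are illustrative only, not existing MXNet API. It also shows why widening seq_length_/batch_size_/input_size_ from int to index_t matters for the test input above.

    # Minimal sketch, assuming the formula in GetRnnParamSize (no projection):
    # gates per mode: rnn_relu/rnn_tanh = 1, gru = 3, lstm = 4.
    INT32_MAX = 2**31 - 1

    def rnn_param_size(num_layers, input_size, state_size, direction, mode):
        gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'gru': 3, 'lstm': 4}[mode]
        size = state_size * direction * gates
        size1 = (input_size + state_size + 2) * size               # first layer
        size2 = (state_size * direction + state_size + 2) * size   # other layers
        return size1 + (num_layers - 1) * size2

    # Shapes used by check_rnn(): data = (2**28, 4, 4), state_size = 1, num_layers = 1.
    assert rnn_param_size(1, 4, 1, 1, 'rnn_relu') == 7   # parameters_relu_tanh
    assert rnn_param_size(1, 4, 1, 1, 'lstm') == 28      # parameters_lstm
    assert rnn_param_size(1, 4, 1, 1, 'gru') == 21       # parameters_gru

    # Why int -> index_t: the flattened input alone has 2**28 * 4 * 4 = 2**32
    # elements, which overflows a 32-bit int but fits in index_t (int64 when
    # large-tensor support is enabled).
    assert 2**28 * 4 * 4 > INT32_MAX

The same per-element counts (T * N * H * D and friends) drive the workspace, reserve-space, and OpenMP loop bounds in rnn_impl.h, which is why the loop indices in those hunks are widened along with the size computations.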