diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 557c1117739a..d41b5b4f030b 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -63,7 +63,7 @@ struct RNNParam : public dmlc::Parameter { bool bidirectional, state_outputs; int mode; float p; - int seq_length_, batch_size_, input_size_; + index_t seq_length_, batch_size_, input_size_; bool use_sequence_length; dmlc::optional projection_size; @@ -122,8 +122,8 @@ struct RNNParam : public dmlc::Parameter { } }; -inline int GetRnnParamSize(int num_layer, - int input_size, +inline index_t GetRnnParamSize(int num_layer, + index_t input_size, int state_size, int direction, int mode, @@ -140,14 +140,14 @@ inline int GetRnnParamSize(int num_layer, size *= 3; break; } - int size1 = (input_size + state_size + 2) * size; // first layer size - int size2 = (state_size * direction + state_size + 2) * size; // other layers size + index_t size1 = (input_size + state_size + 2) * size; // first layer size + index_t size2 = (state_size * direction + state_size + 2) * size; // other layers size if (projection_size.has_value()) { - int proj_size = projection_size.value(); + index_t proj_size = projection_size.value(); size1 = (input_size + proj_size + 2) * size; size2 = (proj_size * direction + proj_size + 2) * size; } - int param_size = size1 + (num_layer - 1) * size2; + index_t param_size = size1 + (num_layer - 1) * size2; if (projection_size.has_value()) { param_size += projection_size.value() * state_size * num_layer * direction; } @@ -182,8 +182,8 @@ inline int GetRnnBiasSize(int num_layer, * - output -> h[t](, c[t] additionally with Lstm) time by time(sz: NxH(x2)) * - intermediate y[1...T] as next layer's inputs(sz: TxNxHxD) */ -inline size_t GetRNNWorkspaceSize(int seq_length, - int batch_size, +inline size_t GetRNNWorkspaceSize(index_t seq_length, + index_t batch_size, int hidden_size, int projection_size, int direction, @@ -214,8 +214,8 @@ inline size_t GetRNNWorkspaceSize(int seq_length, inline size_t GetRNNReserveSpaceSize(int num_layer, int direction, - int seq_length, - int batch_size, + index_t seq_length, + index_t batch_size, int hidden_size, int mode) { size_t size = 0; @@ -279,9 +279,9 @@ void RNNForwardTraining(DType* ws, bool state_outputs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, @@ -321,9 +321,9 @@ void RNNForwardInference(DType* ws, bool state_outputs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, const int projection_size, DType* x_ptr, @@ -363,9 +363,9 @@ void RNNBackward(DType* ws, DType* rs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 9f9cf875e0ac..5acf4eb7b3bd 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -59,9 +59,9 @@ void LstmForwardTrainingSingleLayer(DType* ws, DType* rs, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor 
&x, const Tensor &hx, @@ -88,17 +88,17 @@ void LstmForwardTrainingSingleLayer(DType* ws, const int offset = bid ? H : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); DType ft = sigmoid(yx[t][j][1][k] + yh[j][1][k] + bx[1][k] + bh[1][k]); DType gt = tanh(yx[t][j][2][k] + yh[j][2][k] + bx[2][k] + bh[2][k]); @@ -127,9 +127,9 @@ void LstmForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -145,16 +145,16 @@ void LstmForwardTraining(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t cell_size = N * H; unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) int idx = 0; // state & cell state's idx; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < L; ++i) { - const int input_size = i ? H * D : I; - const int w_size = (input_size + H) * H * 4; + const index_t input_size = i ? H * D : I; + const index_t w_size = (input_size + H) * H * 4; Tensor x(x_ptr, Shape2(T * N, input_size)); Tensor y(rs2 + y_offset, Shape3(T, N, H * D)); LstmForwardTrainingSingleLayer(ws, rs2, state_outputs, false, T, N, input_size, H, x, @@ -175,7 +175,7 @@ void LstmForwardTraining(DType* ws, b_ptr += b_size; if (dropout > 0.0f) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * H * D; j++) { + for (index_t j = 0; j < T * N * H * D; j++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[i * T * N * H * D + j] = 0; @@ -196,7 +196,7 @@ void LstmForwardTraining(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = (rs2 + y_offset)[i]; } } @@ -205,9 +205,9 @@ template void LstmForwardInferenceSingleLayer(DType* ws, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, const Tensor &x, @@ -237,19 +237,19 @@ void LstmForwardInferenceSingleLayer(DType* ws, const int proj_offset = bid ? 
P : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; if (P > 0) { linalg_gemm(i ? r : hx, wh, yh_flat, alpha, beta, false, true); } else { linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); } #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { + for (index_t jk = 0; jk < cell_size; ++jk) { int j = jk / H; int k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); @@ -287,9 +287,9 @@ void LstmForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, DType* x_ptr, @@ -303,16 +303,16 @@ void LstmForwardInference(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, P ? P : H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int cell_size = N * H; - const int projection_size = (P ? P : H) * N; + const index_t b_size = 2 * H * 4; + const index_t cell_size = N * H; + const index_t projection_size = (P ? P : H) * N; DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2; DType* y_cur_ptr = y_ptr; int idx = 0; // state & cell state's idx; bool flag = L % 2 ? false : true; for (int i = 0; i < L; ++i) { - const int input_size = i ? (P ? P : H) * D : I; - int w_size = (input_size + (P ? P : H)) * H * 4; + const index_t input_size = i ? (P ? P : H) * D : I; + index_t w_size = (input_size + (P ? P : H)) * H * 4; if (P > 0) { w_size += P * H; } @@ -356,9 +356,9 @@ void LstmBackwardSingleLayer(DType* ws, DType* rs, DType* tmp_buf, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -408,41 +408,41 @@ void LstmBackwardSingleLayer(DType* ws, const DType beta0 = 0.0; const DType beta1 = 1.0; const DType beta2 = 2.0; - const int cell_size = N * H; + const index_t cell_size = N * H; if (dhy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = dhy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = 0; } } if (dcy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = dcy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = 0; } } - for (int i = T - 1; i >= 0; --i) { - int t = bid ? T - 1 - i : i; - int tnext = bid ? t + 1 : t - 1; + for (index_t i = T - 1; i >= 0; --i) { + index_t t = bid ? T - 1 - i : i; + index_t tnext = bid ? t + 1 : t - 1; const Tensor& dhnext = i ? dh : dhx; const Tensor& dcnext = i ? dc : dcx; const Tensor& hnext = i ? htmp : hx; const Tensor& cnext = i ? 
c[i - 1] : cx; #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType tc = tanh(c[i][j][k]); DType it = ifgo[i][j][k][0]; DType ft = ifgo[i][j][k][1]; @@ -485,13 +485,13 @@ void LstmBackwardSingleLayer(DType* ws, if (req_params != kNullOp && req_params != kAddTo) { linalg_gemm(dyx, x, dwx, alpha, beta0, true, false); } - const int row = T * N; - const int col = H * 4; + const index_t row = T * N; + const index_t col = H * 4; if (req_params != kNullOp) { if (req_params != kAddTo) { - for (int i = 0; i < row; ++i) { + for (index_t i = 0; i < row; ++i) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += dyx[i][j]; dbh[j] = dbx[j]; } @@ -500,20 +500,20 @@ void LstmBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf, Shape2(col, T)); const Tensor tmp_dbh(tmp_buf + col * T, Shape2(col, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < col * T; ++i) { + for (index_t i = 0; i < col * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { - for (int i = 0; i < N; ++i) { + for (index_t j = 0; j < col; ++j) { + for (index_t i = 0; i < N; ++i) { tmp_dbx[j][t] += dyx[t * N + i][j]; tmp_dbh[j][t] = tmp_dbx[j][t]; } } #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += tmp_dbx[j][t] + dbx[j]; dbh[j] += tmp_dbh[j][t] + dbh[j]; } @@ -527,9 +527,9 @@ void LstmBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -558,16 +558,16 @@ void LstmBackward(DType* ws, Tensor cx(cx_ptr, Shape3(total_layers, N, H)); Tensor dhx(dhx_ptr, Shape3(total_layers, N, H)); Tensor dcx(dcx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int w_size1 = (I + H) * H * 4; // first layer - const int w_size2 = (D * H + H) * H * 4; // other layers - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t w_size1 = (I + H) * H * 4; // first layer + const index_t w_size2 = (D * H + H) * H * 4; // other layers + const index_t cell_size = N * H; DType* dy_tmp_ptr = ws2 + T * cell_size * 4 + cell_size * 3; for (int i = L - 1; i >= 0; --i) { - const int input_size = i ? H * D : I; - const int w_size = i ? w_size2 : w_size1; + const index_t input_size = i ? H * D : I; + const index_t w_size = i ? w_size2 : w_size1; int idx = i * D; DType* w_cur_ptr = i ? w_ptr + (w_size1 + (i - 1) * w_size2) * D : w_ptr; DType* dw_cur_ptr = i ? 
dw_ptr + (w_size1 + (i - 1) * w_size2) * D : dw_ptr; @@ -599,7 +599,7 @@ void LstmBackward(DType* ws, dropout_random = dropout_random - T * N * D * H; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * D * H; j++) { + for (index_t j = 0; j < T * N * D * H; j++) { if (dropout_random[j] == 0) { dx.dptr_[j] = 0; } else { @@ -616,9 +616,9 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -655,13 +655,13 @@ void GruForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -679,7 +679,7 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -693,11 +693,11 @@ void GruForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + gemmC2[ztb + j] @@ -720,11 +720,11 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + @@ -744,7 +744,7 @@ void GruForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -752,7 +752,7 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -766,9 +766,9 @@ void GruForwardInference(DType* ws, bool 
state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -819,9 +819,9 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -867,13 +867,13 @@ void GruForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -892,7 +892,7 @@ void GruForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -910,11 +910,11 @@ void GruForwardTrainingSingleLayer(DType* ws, gemmC1_t = gemmC1 + t * N * 3 * H; DType* Mnht = Mnh + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; Mnht[i * H + j] = gemmC2[ntb + j] + bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); @@ -942,11 +942,11 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; back_Mnht[i * H + j] = gemmC2[ntb + j] + back_bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); @@ -968,7 +968,7 @@ void GruForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -976,7 +976,7 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -991,9 +991,9 @@ void GruForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1030,7 +1030,7 @@ void GruForwardTraining(DType* ws, if (dropout > 0.0f && l > 0) { const 
int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[(l - 1) * T * N * I + i] = 0; @@ -1062,7 +1062,7 @@ void GruForwardTraining(DType* ws, } const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1071,9 +1071,9 @@ template void GruBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1139,7 +1139,7 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -1148,7 +1148,7 @@ void GruBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -1156,7 +1156,7 @@ void GruBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -1164,13 +1164,13 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -1180,7 +1180,7 @@ void GruBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -1193,7 +1193,7 @@ void GruBackwardSingleLayer(DType* ws, dat = da + t * N * 3 * H; dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { int nid = i * 3 * H + 2 * H + j; int zid = i * 3 * H + H + j; @@ -1239,7 +1239,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += da[j * 3 * H + i]; dbh[i] += dar[j * 3 * H + i]; } @@ -1248,15 +1248,15 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for 
(index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1286,7 +1286,7 @@ void GruBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -1296,7 +1296,7 @@ void GruBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -1310,12 +1310,12 @@ void GruBackwardSingleLayer(DType* ws, dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int nid = i * 3 * H + 2 * H + j; - int zid = i * 3 * H + H + j; - int rid = i * 3 * H + j; - int id = i * H + j; + index_t nid = i * 3 * H + 2 * H + j; + index_t zid = i * 3 * H + H + j; + index_t rid = i * 3 * H + j; + index_t id = i * H + j; dat[nid] = back_dht1[id] * (1 - zt[id]) * (1 - nt[id] * nt[id]); dart[zid] = dat[zid] = back_dht1[id] * (back_ht1[i * D * H + H + j] - nt[id]) * zt[id] * (1 - zt[id]); @@ -1357,7 +1357,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += da[j * 3 * H + i]; back_dbh[i] += dar[j * 3 * H + i]; } @@ -1366,14 +1366,14 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1404,7 +1404,7 @@ void GruBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -1415,9 +1415,9 @@ void GruBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1469,7 +1469,7 @@ void GruBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -1488,7 +1488,7 @@ void GruBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -1498,7 +1498,7 @@ void 
GruBackward(DType* ws, } if (l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateR_l = gateR_l - T * D * N * H; @@ -1532,9 +1532,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1569,13 +1569,13 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1593,7 +1593,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1607,9 +1607,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1631,9 +1631,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1652,7 +1652,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1660,7 +1660,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1674,9 +1674,9 @@ void VanillaRNNForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1729,9 +1729,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1770,13 +1770,13 @@ void 
VanillaRNNForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1795,7 +1795,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1810,9 +1810,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, nt = gateN + t * N * H; gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1834,9 +1834,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N)); linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1856,7 +1856,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1864,7 +1864,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1879,9 +1879,9 @@ void VanillaRNNForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1916,7 +1916,7 @@ void VanillaRNNForwardTraining(DType* ws, } if (dropout > 0.0f && l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[(l - 1) * T * N * I + i] = 0; @@ -1944,7 +1944,7 @@ void VanillaRNNForwardTraining(DType* ws, wh_l = wx_l + I * H; } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1953,9 +1953,9 @@ template void VanillaRNNBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const 
int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -2013,7 +2013,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -2022,7 +2022,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -2030,7 +2030,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -2038,13 +2038,13 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -2054,7 +2054,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -2063,9 +2063,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, nt = gateN + t * N * H; dart = dar + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2104,7 +2104,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += dar[j * H + i]; dbh[i] = dbx[i]; } @@ -2113,15 +2113,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2151,7 +2151,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -2161,7 +2161,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -2171,9 +2171,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dart = dar + t * 
N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = back_dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2213,7 +2213,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += dar[j * H + i]; back_dbh[i] = back_dbx[i]; } @@ -2222,15 +2222,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2261,7 +2261,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -2272,9 +2272,9 @@ void VanillaRNNBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -2324,7 +2324,7 @@ void VanillaRNNBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -2343,7 +2343,7 @@ void VanillaRNNBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -2353,7 +2353,7 @@ void VanillaRNNBackward(DType* ws, } if (l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateN_l = gateN_l - T * D * N * H; diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index cc1977acdd91..5fb0ff81da6b 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -39,6 +39,7 @@ SMALL_Y = 50 LARGE_SIZE = LARGE_X * SMALL_Y LARGE_TENSOR_SHAPE = 2**32 +RNN_LARGE_TENSOR = 2**28 def test_nn(): @@ -479,7 +480,6 @@ def check_embedding(): assert out.shape[0] == LARGE_TENSOR_SHAPE assert out.shape[1] == 1 - assert out.shape[2] == 1 def check_spatial_transformer(): data = nd.random_normal(shape=(2, 2**29, 1, 6)) @@ -530,6 +530,39 @@ def check_multi_lars(): # Trigger lazy evaluation of the output NDArray and ensure that it has been filled assert type(out[0, 0].asscalar()).__name__ == 'float32' + + def check_rnn(): + data = nd.random_normal(shape=(RNN_LARGE_TENSOR, 4, 4)) + 
parameters_relu_tanh = nd.random_normal(shape=(7,)) + parameters_lstm = nd.random_normal(shape=(28,)) + parameters_gru = nd.random_normal(shape=(21,)) + state = nd.random_normal(shape=(1, 4, 1)) + state_cell = nd.random_normal(shape=(1, 4, 1)) + mode_relu = 'rnn_relu' + mode_tanh = 'rnn_tanh' + mode_lstm = 'lstm' + mode_gru = 'gru' + state_size = 1 + num_layers = 1 + + out_relu = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_relu, + state_size=state_size, num_layers=num_layers) + + out_tanh = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_tanh, + state_size=state_size, num_layers=num_layers) + + out_lstm = nd.RNN(data=data, parameters=parameters_lstm, state=state, mode=mode_lstm, + state_cell=state_cell, state_size=state_size, num_layers=num_layers) + + out_gru = nd.RNN(data=data, parameters=parameters_gru, state=state, mode=mode_gru, + state_size=state_size, num_layers=num_layers) + + for out in [out_relu, out_tanh, out_lstm, out_gru]: + assert out.shape[0] == RNN_LARGE_TENSOR + assert out.shape[1] == 4 + assert out.shape[2] == 1 + + assert type(out[0, 0, 0].asscalar()).__name__ == 'float32' check_gluon_embedding() check_fully_connected() @@ -556,6 +589,7 @@ def check_multi_lars(): check_ravel() check_cumsum() check_multi_lars() + check_rnn() def test_tensor():
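
Note (not part of the patch): the short Python sketch below mirrors the parameter-size arithmetic from GetRnnParamSize in rnn-inl.h (without the projection_size branch) and checks it against the shapes used by check_rnn(); the helper name rnn_param_size and the gate-count table are illustrative only, not existing MXNet API. It also shows why widening seq_length_/batch_size_/input_size_ from int to index_t matters for the test input above.

    # Minimal sketch, assuming the formula in GetRnnParamSize (no projection):
    # gates per mode: rnn_relu/rnn_tanh = 1, gru = 3, lstm = 4.
    INT32_MAX = 2**31 - 1

    def rnn_param_size(num_layers, input_size, state_size, direction, mode):
        gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'gru': 3, 'lstm': 4}[mode]
        size = state_size * direction * gates
        size1 = (input_size + state_size + 2) * size               # first layer
        size2 = (state_size * direction + state_size + 2) * size   # other layers
        return size1 + (num_layers - 1) * size2

    # Shapes used by check_rnn(): data = (2**28, 4, 4), state_size = 1, num_layers = 1.
    assert rnn_param_size(1, 4, 1, 1, 'rnn_relu') == 7   # parameters_relu_tanh
    assert rnn_param_size(1, 4, 1, 1, 'lstm') == 28      # parameters_lstm
    assert rnn_param_size(1, 4, 1, 1, 'gru') == 21       # parameters_gru

    # Why int -> index_t: the flattened input alone has 2**28 * 4 * 4 = 2**32
    # elements, which overflows a 32-bit int but fits in index_t (int64 when
    # large-tensor support is enabled).
    assert 2**28 * 4 * 4 > INT32_MAX

The same per-element counts (T * N * H * D and friends) drive the workspace, reserve-space, and OpenMP loop bounds in rnn_impl.h, which is why the loop indices in those hunks are widened along with the size computations.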