From 55cae0ff04f2e5d94616d49d7e85008b5cd0d293 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Thu, 12 Apr 2018 19:21:51 -0400
Subject: [PATCH 01/15] maxpooling component over blocks

---
 src/cudamatrix/cu-kernels.cu       | 66 +++++++++++++++++++++++
 src/cudamatrix/cu-kernels.h        | 14 +++++
 src/cudamatrix/cu-matrix.cc        | 86 ++++++++++++++++++++++++++++++
 src/cudamatrix/cu-matrix.h         |  3 ++
 src/nnet3/nnet-simple-component.cc | 71 ++++++++++++++++++++++++
 src/nnet3/nnet-simple-component.h  | 43 +++++++++++++++
 6 files changed, 283 insertions(+)
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 934a860a055..ae2781f1b07 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -773,6 +773,46 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
     }
 }
 
+template<typename Real>
+__global__
+static void _max_mat_blocks(Real alpha, const Real* src,
+                            int32_cuda num_row_blocks,
+                            int32_cuda num_col_blocks, Real* dst, MatrixDim d,
+                            int src_stride) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda index = i + j * d.stride;
+  int32_cuda index_src = i + j * src_stride;
+  if (i < d.cols && j < d.rows)
+    for (int32_cuda p = 0; p < num_row_blocks; p++) {
+      for (int32_cuda q = 0; q < num_col_blocks; q++) {
+        dst[index] = fmax(
+            src[index_src + p * src_stride * d.rows + q * d.cols],
+            dst[index]);
+      }
+    }
+}
+
+template<typename Real>
+__global__
+static void _max_mat_blocks_trans(Real alpha, const Real* src,
+                                  int32_cuda num_row_blocks,
+                                  int32_cuda num_col_blocks, Real* dst,
+                                  MatrixDim d, int src_stride) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda index = i + j * d.stride;
+  int32_cuda index_src = j + i * src_stride;
+  if (i < d.cols && j < d.rows)
+    for (int32_cuda p = 0; p < num_row_blocks; p++) {
+      for (int32_cuda q = 0; q < num_col_blocks; q++) {
+        dst[index] = fmax(
+            src[index_src + p * src_stride * d.cols + q * d.rows],
+            dst[index]);
+      }
+    }
+}
+
 template<typename Real>
 __global__
 static void _set_mat_mat_div_mat(const Real* A, const Real* B, const Real* C,
@@ -3952,6 +3992,19 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
   }
 }
 
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
+                          int32_cuda num_row_blocks, int32_cuda num_col_blocks,
+                          float* dst, MatrixDim d, int src_stride,
+                          int A_trans) {
+  if (A_trans) {
+    _max_mat_blocks_trans<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks,
+        dst, d, src_stride);
+  } else {
+    _max_mat_blocks<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks, dst,
+        d, src_stride);
+  }
+}
+
 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim) {
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
@@ -4656,6 +4709,19 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
   }
 }
 
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
+                          int32_cuda num_row_blocks, int32_cuda num_col_blocks,
+                          double* dst, MatrixDim d, int src_stride,
+                          int A_trans) {
+  if (A_trans) {
+    _max_mat_blocks_trans<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks,
+        dst, d, src_stride);
+  } else {
+    _max_mat_blocks<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks, dst,
+        d, src_stride);
+  }
+}
+
 void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim) {
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 8f719a8c4a1..234c4fe48de 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -158,6 +158,20 @@ inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
   cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
                        d, src_stride, A_trans);
 }
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha,
+                                const double *src, int32_cuda num_row_blocks,
+                                int32_cuda num_col_blocks, double *dst,
+                                MatrixDim d, int src_stride, int A_trans) {
+  cudaD_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
+                       d, src_stride, A_trans);
+}
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
+                                int32_cuda num_row_blocks,
+                                int32_cuda num_col_blocks, float *dst,
+                                MatrixDim d, int src_stride, int A_trans) {
+  cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
+                       d, src_stride, A_trans);
+}
 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha,
                                   const double *src, MatrixDim src_dim,
                                   double *dst, MatrixDim dst_dim) {
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index beccd9dc4a5..a7000d93697 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1187,6 +1187,92 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
   }
 }
 
+
+template<typename Real>
+void CuMatrixBase<Real>::MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
+                                      MatrixTransposeType transA) {
+  if (num_rows_ == 0 || num_cols_ == 0) return;
+
+  if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) &&
+      A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) {
+    // This is the "summing", not broadcasting, version of AddMatBlocks.
+    // It supports both regular and transposed operation.
+    int32 num_row_blocks, num_col_blocks;
+    if (transA == kNoTrans) {
+      KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0);
+      num_row_blocks = A.Mat().NumRows() / num_rows_;
+      num_col_blocks = A.Mat().NumCols() / num_cols_;
+    } else {
+      KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0);
+      num_row_blocks = A.Mat().NumRows() / num_cols_;
+      num_col_blocks = A.Mat().NumCols() / num_rows_;
+    }
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      CuTimer tim;
+      dim3 dimGrid, dimBlock;
+      GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+                                            &dimGrid, &dimBlock);
+      cuda_max_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks,
+                          num_col_blocks, data_, Dim(), A.Stride(),
+                          (transA == kTrans ? 1 : 0));
+      CU_SAFE_CALL(cudaGetLastError());
+
+      CuDevice::Instantiate().AccuProfile(__func__, tim);
+    } //else
+#endif
+    // {
+    //   int32 nr, nc;
+    //   if (transA == kNoTrans) {
+    //     nr = num_rows_;
+    //     nc = num_cols_;
+    //   } else {
+    //     nr = num_cols_;
+    //     nc = num_rows_;
+    //   }
+    //   for (int32 i = 0; i < num_row_blocks; i++) {
+    //     for (int32 j = 0; j < num_col_blocks; j++) {
+    //       Mat().AddMat(alpha, SubMatrix<Real>(A.Mat(), i * nr, nr, j * nc, nc),
+    //                    transA);
+    //     }
+    //   }
+    // }
+//   } else {
+//     // This is the "broadcasting" version of MaxMatBlocks, where
+//     // *this is larger than src.
+//     if (transA != kNoTrans)
+//       KALDI_ERR << "Transposed operation not supported currently.";
+//     if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0))
+//       KALDI_ERR << "Invalid sizes of arguments";
+// #if HAVE_CUDA == 1
+//     if (CuDevice::Instantiate().Enabled()) {
+//       CuTimer tim;
+//       dim3 dimGrid, dimBlock;
+//       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+//                                             &dimGrid, &dimBlock);
+//       cuda_add_mat_repeated(dimGrid, dimBlock, alpha,
+//                             A.data_, A.Dim(), data_, Dim());
+//       CU_SAFE_CALL(cudaGetLastError());
+//       CuDevice::Instantiate().AccuProfile(__func__, tim);
+//     } else
+// #endif
+//     {
+//       const MatrixBase<Real> &src_mat = A.Mat(),
+//           &this_mat = this->Mat();
+//       for (int32 row_offset = 0; row_offset < NumRows();
+//            row_offset += src_mat.NumRows()) {
+//         for (int32 col_offset = 0; col_offset < NumCols();
+//              col_offset += src_mat.NumCols()) {
+//           SubMatrix<Real> this_part(this_mat,
+//                                     row_offset, src_mat.NumRows(),
+//                                     col_offset, src_mat.NumCols());
+//           this_part.AddMat(alpha, src_mat);
+//         }
+//       }
+//     }
+  }
+}
+
 /// dst = a * b / c (by element; when c = 0, dst = a)
 /// dst can be an alias of a, b or c safely and get expected result.
 template<typename Real>
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 03e69b639d3..334daf65645 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -509,6 +509,9 @@ class CuMatrixBase {
   void AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
                     MatrixTransposeType trans = kNoTrans);
 
+  void MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
+                    MatrixTransposeType trans = kNoTrans);
+
   /// (for each column c of *this), c = alpha * col + beta * c
   void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
   /// (for each row r of *this), r = alpha * row + beta * r
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 4eb078c0fcb..02b7d661459 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -5860,6 +5860,77 @@ void SumBlockComponent::Backprop(
   }
 }
 
+MaxPoolingOverBlock::MaxPoolingOverBlock(const MaxPoolingOverBlock &other):
+    input_dim_(other.input_dim_), output_dim_(other.output_dim_),
+    scale_(other.scale_) { }
+
+void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) {
+  scale_ = 1.0;
+  bool ok = cfl->GetValue("input-dim", &input_dim_) &&
+      cfl->GetValue("output-dim", &output_dim_);
+  if (!ok)
+    KALDI_ERR << "input-dim and output-dim must both be provided.";
+  if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0)
+    KALDI_ERR << "Invalid values input-dim=" << input_dim_
+              << " output-dim=" << output_dim_;
+  cfl->GetValue("scale", &scale_);
+  if (cfl->HasUnusedValues())
+    KALDI_ERR << "Could not process these elements in initializer: "
+              << cfl->UnusedValues();
+}
+
+void MaxPoolingOverBlock::Read(std::istream &is, bool binary) {
+  ExpectOneOrTwoTokens(is, binary, "<MaxPoolingOverBlock>", "<InputDim>");
+  ReadBasicType(is, binary, &input_dim_);
+  ExpectToken(is, binary, "<OutputDim>");
+  ReadBasicType(is, binary, &output_dim_);
+  ExpectToken(is, binary, "<Scale>");
+  ReadBasicType(is, binary, &scale_);
+  ExpectToken(is, binary, "</MaxPoolingOverBlock>");
+}
+
+void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<MaxPoolingOverBlock>");
+  WriteToken(os, binary, "<InputDim>");
+  WriteBasicType(os, binary, input_dim_);
+  WriteToken(os, binary, "<OutputDim>");
+  WriteBasicType(os, binary, output_dim_);
+  WriteToken(os, binary, "<Scale>");
+  WriteBasicType(os, binary, scale_);
+  WriteToken(os, binary, "</MaxPoolingOverBlock>");
+}
+
+std::string MaxPoolingOverBlock::Info() const {
+  std::ostringstream stream;
+  stream << Type() << ", input-dim=" << input_dim_
+         << ", output-dim=" << output_dim_
+         << ", scale=" << scale_;
+  return stream.str();
+}
+
+void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
+                                   const CuMatrixBase<BaseFloat> &in,
+                                   CuMatrixBase<BaseFloat> *out) const {
+  KALDI_ASSERT(out->NumRows() == in.NumRows() &&
+               out->NumCols() == output_dim_ &&
+               in.NumCols() == input_dim_);
+  out->MaxMatBlocks(scale_, in, kNoTrans);
+  return NULL;
+}
+
+void MaxPoolingOverBlock::Backprop(
+    const std::string &debug_info,
+    const ComponentPrecomputedIndexes *indexes,
+    const CuMatrixBase<BaseFloat> &, //in_value
+    const CuMatrixBase<BaseFloat> &, // out_value,
+    const CuMatrixBase<BaseFloat> &out_deriv,
+    void *memo,
+    Component *to_update,
+    CuMatrixBase<BaseFloat> *in_deriv) const {
+  if (in_deriv) {
+    in_deriv->MaxMatBlocks(scale_, out_deriv, kNoTrans);
+  }
+}
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 3929c253aab..0f83631b7e6 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1220,6 +1220,49 @@ class SumBlockComponent: public Component {
   SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow.
 };
 
+/**  MaxPoolingOverBlock gets maximum value over blocks of its input: for instance, if
+     you create one with the config "input-dim=400 output-dim=100",
+     its output will be the maximum value over the 4 100-dimensional blocks of
+     the input.
+
+     Accepted values on its config-file line are:
+        input-dim  The input dimension.  Required.
+        output-dim  The block dimension.  Required.  Must divide input-dim.
+        scale      A scaling factor on the output.  Defaults to 1.0.
+ */
+class MaxPoolingOverBlock: public Component {
+ public:
+  explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other);
+  MaxPoolingOverBlock() { }
+  virtual std::string Type() const { return "MaxPoolingOverBlock"; }
+  virtual int32 Properties() const {
+    return kSimpleComponent|kPropagateAdds|kBackpropAdds;
+  }
+  virtual void (ConfigLine *cfl);
+  virtual int32 InputDim() const { return input_dim_; }
+  virtual int32 OutputDim() const { return output_dim_; }
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+  virtual std::string Info() const;
+  virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); }
+  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
+                          const CuMatrixBase<BaseFloat> &in,
+                          CuMatrixBase<BaseFloat> *out) const;
+  virtual void Backprop(const std::string &debug_info,
+                        const ComponentPrecomputedIndexes *indexes,
+                        const CuMatrixBase<BaseFloat> &, //in_value
+                        const CuMatrixBase<BaseFloat> &, // out_value,
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        void *memo,
+                        Component *to_update,
+                        CuMatrixBase<BaseFloat> *in_deriv) const;
+ private:
+  int32 input_dim_;
+  int32 output_dim_;
+  BaseFloat scale_;
+  MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
+};
+
 
 /*
  ClipGradientComponent just duplicates its input, but clips gradients

From ecb5a86ec0cd03af7103252ed2fefcb156aaad33 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Thu, 12 Apr 2018 19:45:12 -0400
Subject: [PATCH 02/15] maxpooling component over blocks: repeated func

---
 src/cudamatrix/cu-kernels.cu | 24 +++++++++++++++++++++++
 src/cudamatrix/cu-kernels.h  | 18 +++++++++++++----
 src/cudamatrix/cu-matrix.cc  | 38 ++++++++++++++++++------------------
 3 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index ae2781f1b07..fe1bcf60fc9 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -813,6 +813,21 @@ static void _max_mat_blocks_trans(Real alpha, const Real* src,
     }
 }
 
+template<typename Real>
+__global__
+static void _max_mat_repeated(Real alpha, const Real* src,
+                              MatrixDim src_dim, Real* dst,
+                              MatrixDim dst_dim) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda src_i = i % src_dim.cols,
+      src_j = j % src_dim.rows,
+      dst_index = i + j * dst_dim.stride,
+      src_index = src_i + src_j * src_dim.stride;
+  if (i < dst_dim.cols && j < dst_dim.rows)
+    dst[dst_index] += alpha * src[src_index];
+}
+
 template<typename Real>
 __global__
 static void _set_mat_mat_div_mat(const Real* A, const Real* B, const Real* C,
@@ -4010,6 +4025,10 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
+void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
+                            MatrixDim src_dim, float *dst, MatrixDim dst_dim) {
+  _max_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
+}
 
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
                                const float *C, float *dst, MatrixDim d,
@@ -4727,6 +4746,11 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
+void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
+                            MatrixDim src_dim, double *dst, MatrixDim dst_dim) {
+  _max_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
+}
+
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
                                const double *B, const double *C, double *dst,
                                MatrixDim d, int stride_a, int stride_b,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 234c4fe48de..d106c973aa1 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -158,6 +158,16 @@ inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
   cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
                        d, src_stride, A_trans);
 }
+inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha,
+                                  const double *src, MatrixDim src_dim,
+                                  double *dst, MatrixDim dst_dim) {
+  cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
+}
+inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
+                                  const float *src, MatrixDim src_dim,
+                                  float *dst, MatrixDim dst_dim) {
+  cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
+}
 inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha,
                                 const double *src, int32_cuda num_row_blocks,
                                 int32_cuda num_col_blocks, double *dst,
@@ -172,15 +182,15 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
   cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
                        d, src_stride, A_trans);
 }
-inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha,
+inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha,
                                   const double *src, MatrixDim src_dim,
                                   double *dst, MatrixDim dst_dim) {
-  cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
+  cudaD_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
-inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
+inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
                                   const float *src, MatrixDim src_dim,
                                   float *dst, MatrixDim dst_dim) {
-  cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
+  cudaF_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                                   MatrixDim mat_dim, const double *mat2,
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index a7000d93697..bf1f019d3ac 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1237,25 +1237,25 @@ void CuMatrixBase<Real>::MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
     //     }
     //   }
     // }
-//   } else {
-//     // This is the "broadcasting" version of MaxMatBlocks, where
-//     // *this is larger than src.
-//     if (transA != kNoTrans)
-//       KALDI_ERR << "Transposed operation not supported currently.";
-//     if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0))
-//       KALDI_ERR << "Invalid sizes of arguments";
-// #if HAVE_CUDA == 1
-//     if (CuDevice::Instantiate().Enabled()) {
-//       CuTimer tim;
-//       dim3 dimGrid, dimBlock;
-//       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
-//                                             &dimGrid, &dimBlock);
-//       cuda_add_mat_repeated(dimGrid, dimBlock, alpha,
-//                             A.data_, A.Dim(), data_, Dim());
-//       CU_SAFE_CALL(cudaGetLastError());
-//       CuDevice::Instantiate().AccuProfile(__func__, tim);
-//     } else
-// #endif
+  } else {
+    // This is the "broadcasting" version of MaxMatBlocks, where
+    // *this is larger than src.
+    if (transA != kNoTrans)
+      KALDI_ERR << "Transposed operation not supported currently.";
+    if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0))
+      KALDI_ERR << "Invalid sizes of arguments";
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      CuTimer tim;
+      dim3 dimGrid, dimBlock;
+      GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+                                            &dimGrid, &dimBlock);
+      cuda_max_mat_repeated(dimGrid, dimBlock, alpha,
+                            A.data_, A.Dim(), data_, Dim());
+      CU_SAFE_CALL(cudaGetLastError());
+      CuDevice::Instantiate().AccuProfile(__func__, tim);
+    } //else
+#endif
 //     {
 //       const MatrixBase<Real> &src_mat = A.Mat(),
 //           &this_mat = this->Mat();

From b626325c01280cbc2a073c9162bafe327a548396 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Mon, 23 Apr 2018 10:45:58 -0400
Subject: [PATCH 03/15] remove scale parameter and add comments

---
 src/cudamatrix/cu-kernels.cu       | 28 ++++++++++++++--------------
 src/cudamatrix/cu-kernels.h        |  8 ++++----
 src/cudamatrix/cu-matrix.cc        |  6 +++---
 src/cudamatrix/cu-matrix.h         | 18 +++++++++++++++++-
 src/nnet3/nnet-simple-component.cc | 16 ++++------------
 src/nnet3/nnet-simple-component.h  |  2 --
 6 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index fe1bcf60fc9..63ac13cd6ef 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -775,7 +775,7 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks(Real alpha, const Real* src,
+static void _max_mat_blocks(const Real* src,
                             int32_cuda num_row_blocks,
                             int32_cuda num_col_blocks, Real* dst, MatrixDim d,
                             int src_stride) {
@@ -795,7 +795,7 @@ static void _max_mat_blocks(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks_trans(Real alpha, const Real* src,
+static void _max_mat_blocks_trans(const Real* src,
                                   int32_cuda num_row_blocks,
                                   int32_cuda num_col_blocks, Real* dst,
                                   MatrixDim d, int src_stride) {
@@ -815,7 +815,7 @@ static void _max_mat_blocks_trans(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_repeated(Real alpha, const Real* src,
+static void _max_mat_repeated(const Real* src,
                               MatrixDim src_dim, Real* dst,
                               MatrixDim dst_dim) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -825,7 +825,7 @@ static void _max_mat_repeated(Real alpha, const Real* src,
       dst_index = i + j * dst_dim.stride,
       src_index = src_i + src_j * src_dim.stride;
   if (i < dst_dim.cols && j < dst_dim.rows)
-    dst[dst_index] += alpha * src[src_index];
+    dst[dst_index] += src[src_index];
 }
 
 template<typename Real>
@@ -4007,15 +4007,15 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
   }
 }
 
-void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float* src,
                           int32_cuda num_row_blocks, int32_cuda num_col_blocks,
                           float* dst, MatrixDim d, int src_stride,
                           int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks,
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks,
         dst, d, src_stride);
   } else {
-    _max_mat_blocks<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks, dst,
+    _max_mat_blocks<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks, dst,
         d, src_stride);
   }
 }
@@ -4025,9 +4025,9 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
+void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, const float* src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim) {
-  _max_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
+  _max_mat_repeated<<<Gr,Bl>>>(src, src_dim, dst, dst_dim);
 }
 
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
@@ -4728,15 +4728,15 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
   }
 }
 
-void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double* src,
                           int32_cuda num_row_blocks, int32_cuda num_col_blocks,
                           double* dst, MatrixDim d, int src_stride,
                           int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks,
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks,
         dst, d, src_stride);
   } else {
-    _max_mat_blocks<<<Gr,Bl>>>(alpha, src, num_row_blocks, num_col_blocks, dst,
+    _max_mat_blocks<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks, dst,
         d, src_stride);
   }
 }
@@ -4746,9 +4746,9 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
+void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, const double* src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim) {
-  _max_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
+  _max_mat_repeated<<<Gr,Bl>>>(src, src_dim, dst, dst_dim);
 }
 
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index d106c973aa1..27a8d8a34fe 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -168,26 +168,26 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
                                   float *dst, MatrixDim dst_dim) {
   cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
-inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha,
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
                                 const double *src, int32_cuda num_row_blocks,
                                 int32_cuda num_col_blocks, double *dst,
                                 MatrixDim d, int src_stride, int A_trans) {
   cudaD_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
                        d, src_stride, A_trans);
 }
-inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src,
                                 int32_cuda num_row_blocks,
                                 int32_cuda num_col_blocks, float *dst,
                                 MatrixDim d, int src_stride, int A_trans) {
   cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
                        d, src_stride, A_trans);
 }
-inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha,
+inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl,
                                   const double *src, MatrixDim src_dim,
                                   double *dst, MatrixDim dst_dim) {
   cudaD_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
-inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
+inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl,
                                   const float *src, MatrixDim src_dim,
                                   float *dst, MatrixDim dst_dim) {
   cudaF_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index bf1f019d3ac..c74e3caf3eb 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1189,7 +1189,7 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
 
 
 template<typename Real>
-void CuMatrixBase<Real>::MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
+void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                                       MatrixTransposeType transA) {
   if (num_rows_ == 0 || num_cols_ == 0) return;
 
@@ -1213,7 +1213,7 @@ void CuMatrixBase<Real>::MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
-      cuda_max_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks,
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, num_row_blocks,
                           num_col_blocks, data_, Dim(), A.Stride(),
                           (transA == kTrans ? 1 : 0));
       CU_SAFE_CALL(cudaGetLastError());
@@ -1250,7 +1250,7 @@ void CuMatrixBase<Real>::MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
-      cuda_max_mat_repeated(dimGrid, dimBlock, alpha,
+      cuda_max_mat_repeated(dimGrid, dimBlock,
                             A.data_, A.Dim(), data_, Dim());
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 334daf65645..6d3b35ed19c 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -509,7 +509,23 @@ class CuMatrixBase {
   void AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
                     MatrixTransposeType trans = kNoTrans);
 
-  void MaxMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
+  /// This function does *this = max(*this, src). similar with AddMatBlocks,
+  /// it supports cases where *this and src have different dimension.  
+  /// There are two allowed cases:
+  ///
+  ///  (1) *this is larger than src; this case is probably for backpropagation.
+  ///       In this case, we do a broadcasting operation.  *this must
+  ///       have NumRows() == a * src.NumRows() and NumCols() == b *
+  ///       src.NumCols() for integer a >= 1, b >= 1.  *this will be treated as
+  ///       being made up of of blocks with the same size as src, and to each
+  ///       block we'll add src.  This case does not support trans == kTrans.
+  ///
+  ///  (2) *this is smaller than src; we select the maximum.  src.NumRows() must 
+  ///      == a * this->NumRows(), and src.NumCols() must == b * this->NumCols(), 
+  ///      for a >= 1, b >= 1.  In this case, src will be treated as being made 
+  ///      up of blocks with the same size as *this, and to *this we will select
+  ///      max value over all of those blocks.
+  void MaxMatBlocks(const CuMatrixBase<Real> &A,
                     MatrixTransposeType trans = kNoTrans);
 
   /// (for each column c of *this), c = alpha * col + beta * c
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 02b7d661459..3bfd6c7d100 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -5861,11 +5861,9 @@ void SumBlockComponent::Backprop(
 }
 
 MaxPoolingOverBlock::MaxPoolingOverBlock(const MaxPoolingOverBlock &other):
-    input_dim_(other.input_dim_), output_dim_(other.output_dim_),
-    scale_(other.scale_) { }
+    input_dim_(other.input_dim_), output_dim_(other.output_dim_) { }
 
 void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) {
-  scale_ = 1.0;
   bool ok = cfl->GetValue("input-dim", &input_dim_) &&
       cfl->GetValue("output-dim", &output_dim_);
   if (!ok)
@@ -5873,7 +5871,6 @@ void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) {
   if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0)
     KALDI_ERR << "Invalid values input-dim=" << input_dim_
               << " output-dim=" << output_dim_;
-  cfl->GetValue("scale", &scale_);
   if (cfl->HasUnusedValues())
     KALDI_ERR << "Could not process these elements in initializer: "
               << cfl->UnusedValues();
@@ -5884,8 +5881,6 @@ void MaxPoolingOverBlock::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &input_dim_);
   ExpectToken(is, binary, "<OutputDim>");
   ReadBasicType(is, binary, &output_dim_);
-  ExpectToken(is, binary, "<Scale>");
-  ReadBasicType(is, binary, &scale_);
   ExpectToken(is, binary, "</MaxPoolingOverBlock>");
 }
 
@@ -5895,16 +5890,13 @@ void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, input_dim_);
   WriteToken(os, binary, "<OutputDim>");
   WriteBasicType(os, binary, output_dim_);
-  WriteToken(os, binary, "<Scale>");
-  WriteBasicType(os, binary, scale_);
   WriteToken(os, binary, "</MaxPoolingOverBlock>");
 }
 
 std::string MaxPoolingOverBlock::Info() const {
   std::ostringstream stream;
   stream << Type() << ", input-dim=" << input_dim_
-         << ", output-dim=" << output_dim_
-         << ", scale=" << scale_;
+         << ", output-dim=" << output_dim_;
   return stream.str();
 }
 
@@ -5914,7 +5906,7 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
   KALDI_ASSERT(out->NumRows() == in.NumRows() &&
                out->NumCols() == output_dim_ &&
                in.NumCols() == input_dim_);
-  out->MaxMatBlocks(scale_, in, kNoTrans);
+  out->MaxMatBlocks(in, kNoTrans);
   return NULL;
 }
 
@@ -5928,7 +5920,7 @@ void MaxPoolingOverBlock::Backprop(
     Component *to_update,
     CuMatrixBase<BaseFloat> *in_deriv) const {
   if (in_deriv) {
-    in_deriv->MaxMatBlocks(scale_, out_deriv, kNoTrans);
+    in_deriv->MaxMatBlocks(out_deriv, kNoTrans);
   }
 }
 
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 0f83631b7e6..3983d87816b 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1228,7 +1228,6 @@ class SumBlockComponent: public Component {
      Accepted values on its config-file line are:
         input-dim  The input dimension.  Required.
         output-dim  The block dimension.  Required.  Must divide input-dim.
-        scale      A scaling factor on the output.  Defaults to 1.0.
  */
 class MaxPoolingOverBlock: public Component {
  public:
@@ -1259,7 +1258,6 @@ class MaxPoolingOverBlock: public Component {
  private:
   int32 input_dim_;
   int32 output_dim_;
-  BaseFloat scale_;
   MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
 };
 

From f76586de2c747fa33679770b4849a9068e6d1b91 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Mon, 23 Apr 2018 17:07:22 -0400
Subject: [PATCH 04/15] basic back-propagation(fake cuda)(stride = pool_size)

---
 src/nnet3/nnet-simple-component.cc | 21 +++++++++++++++++----
 src/nnet3/nnet-simple-component.h  |  4 ++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 3bfd6c7d100..a2f51e25a52 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -5913,14 +5913,27 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
 void MaxPoolingOverBlock::Backprop(
     const std::string &debug_info,
     const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &, //in_value
-    const CuMatrixBase<BaseFloat> &, // out_value,
+    const CuMatrixBase<BaseFloat> &in_value, //in_value
+    const CuMatrixBase<BaseFloat> &out_value, // out_value,
     const CuMatrixBase<BaseFloat> &out_deriv,
     void *memo,
     Component *to_update,
     CuMatrixBase<BaseFloat> *in_deriv) const {
-  if (in_deriv) {
-    in_deriv->MaxMatBlocks(out_deriv, kNoTrans);
+  // if (in_deriv) {
+  //   in_deriv->MaxMatBlocks(out_deriv, kNoTrans);
+  // }
+  if (!in_deriv)
+    return;
+
+  int32 num_pools = input_dim_;
+  int32 pool_size = output_dim_ / input_dim_;
+
+  for (int32 q = 0; q < pool_size; q++) {
+    // zero-out mask
+    CuMatrix<BaseFloat> mask;
+    out_value.EqualElementMask(in_value.ColRange(q * num_pools, num_pools), &mask);
+    mask.MulElements(out_deriv);
+    in_deriv.ColRange(q * num_pools, num_pools).CopyFromMat(mask);
   }
 }
 
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 3983d87816b..6284a686333 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1249,8 +1249,8 @@ class MaxPoolingOverBlock: public Component {
                           CuMatrixBase<BaseFloat> *out) const;
   virtual void Backprop(const std::string &debug_info,
                         const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &, //in_value
-                        const CuMatrixBase<BaseFloat> &, // out_value,
+                        const CuMatrixBase<BaseFloat> &in_value, //in_value
+                        const CuMatrixBase<BaseFloat> &out_value, // out_value,
                         const CuMatrixBase<BaseFloat> &out_deriv,
                         void *memo,
                         Component *to_update,

From f284026155d3abef914635c46f39dac07107c0f4 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b04.clsp.jhu.edu>
Date: Mon, 30 Apr 2018 21:39:29 -0400
Subject: [PATCH 05/15] structure of 2.5d maxpooling component (compatible with
 nnet-convolution-component)(without cuda functions)

---
 src/cudamatrix/cu-matrix.cc               | 118 ++++++++-------
 src/cudamatrix/cu-matrix.h                |  37 ++---
 src/nnet3/nnet-convolutional-component.cc | 163 +++++++++++++++++++++
 src/nnet3/nnet-convolutional-component.h  | 166 +++++++++++++++++++++-
 src/nnet3/nnet-simple-component.cc        |  77 ----------
 src/nnet3/nnet-simple-component.h         |  42 ------
 6 files changed, 415 insertions(+), 188 deletions(-)

diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index c74e3caf3eb..dcd37b76709 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1190,86 +1190,100 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
 
 template<typename Real>
 void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
-                                      MatrixTransposeType transA) {
+                                      vector index_max,
+                                      MatrixTransposeType transA,
+                                      const int32 input_t_dim_,
+                                      const int32 pool_t_size_,
+                                      const int32 pool_t_step_,
+                                      const int32 input_h_dim_,
+                                      const int32 pool_h_size_,
+                                      const int32 pool_h_step_,
+                                      const int32 input_f_dim_,
+                                      const int32 pool_f_size_,
+                                      const int32 pool_f_step_) {
   if (num_rows_ == 0 || num_cols_ == 0) return;
 
+  int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+  int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  // Not sure whether this needed?
+  KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0 && 
+               (input_h_dim_ - pool_h_size_) % pool_h_step_ == 0 &&
+               (input_f_dim_ - pool_f_size_) % pool_f_step_ == 0);
+
   if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) &&
       A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) {
-    // This is the "summing", not broadcasting, version of AddMatBlocks.
+    // This is the "forward-propagation" version of MaxMatBlocks.
     // It supports both regular and transposed operation.
-    int32 num_row_blocks, num_col_blocks;
+
     if (transA == kNoTrans) {
-      KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0);
-      num_row_blocks = A.Mat().NumRows() / num_rows_;
-      num_col_blocks = A.Mat().NumCols() / num_cols_;
+      KALDI_ASSERT(A.NumRows() == input_t_dim_ &&
+                   A.NumCols() == input_h_dim_ * input_f_dim_ &&
+                   num_rows_ == num_pools_t  &&
+                   num_cols_ == num_pools_h * num_pools_f);
+
     } else {
-      KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0);
-      num_row_blocks = A.Mat().NumRows() / num_cols_;
-      num_col_blocks = A.Mat().NumCols() / num_rows_;
+      KALDI_ASSERT(A.NumCols() == input_t_dim_ &&
+                   A.NumRows() == input_h_dim_ * input_f_dim_ &&
+                   num_cols_ == num_pools_t  &&
+                   num_rows_ == num_pools_h * num_pools_f);
+
     }
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {
       CuTimer tim;
-      dim3 dimGrid, dimBlock;
-      GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
-                                            &dimGrid, &dimBlock);
-      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, num_row_blocks,
-                          num_col_blocks, data_, Dim(), A.Stride(),
-                          (transA == kTrans ? 1 : 0));
+      dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
+      dim3 dimGrid(1024);
+      // dim3 dimGrid, dimBlock;
+      // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+      //                                       &dimGrid, &dimBlock);
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, Dim(), 
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_,
+                          index_max_, (transA == kTrans ? 1 : 0));
       CU_SAFE_CALL(cudaGetLastError());
 
       CuDevice::Instantiate().AccuProfile(__func__, tim);
-    } //else
+    } else
 #endif
     // {
-    //   int32 nr, nc;
-    //   if (transA == kNoTrans) {
-    //     nr = num_rows_;
-    //     nc = num_cols_;
-    //   } else {
-    //     nr = num_cols_;
-    //     nc = num_rows_;
-    //   }
-    //   for (int32 i = 0; i < num_row_blocks; i++) {
-    //     for (int32 j = 0; j < num_col_blocks; j++) {
-    //       Mat().AddMat(alpha, SubMatrix<Real>(A.Mat(), i * nr, nr, j * nc, nc),
-    //                    transA);
-    //     }
-    //   }
+    // TO DO
+    // maxpooling without cuda
     // }
   } else {
-    // This is the "broadcasting" version of MaxMatBlocks, where
+
+    // This is the "backward-propagation" version of MaxMatBlocks, where
     // *this is larger than src.
-    if (transA != kNoTrans)
-      KALDI_ERR << "Transposed operation not supported currently.";
-    if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0))
-      KALDI_ERR << "Invalid sizes of arguments";
+    if (transA == kNoTrans){
+      KALDI_ASSERT(A.NumRows() == num_pools_t &&
+                   A.NumCols() == num_pools_h * num_pools_f &&
+                   num_rows_ == input_t_dim_  &&
+                   num_cols_ == input_h_dim_ * input_f_dim_);
+    } else {
+      KALDI_ASSERT(A.NumCols() == num_pools_t &&
+                   A.NumRows() == num_pools_h * num_pools_f &&
+                   num_cols_ == input_t_dim_  &&
+                   num_rows_ == input_h_dim_ * input_f_dim_);
+    }
+
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {
       CuTimer tim;
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
-      cuda_max_mat_repeated(dimGrid, dimBlock,
+      cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_,
                             A.data_, A.Dim(), data_, Dim());
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
-    } //else
-#endif
-//     {
-//       const MatrixBase<Real> &src_mat = A.Mat(),
-//           &this_mat = this->Mat();
-//       for (int32 row_offset = 0; row_offset < NumRows();
-//            row_offset += src_mat.NumRows()) {
-//         for (int32 col_offset = 0; col_offset < NumCols();
-//              col_offset += src_mat.NumCols()) {
-//           SubMatrix<Real> this_part(this_mat,
-//                                     row_offset, src_mat.NumRows(),
-//                                     col_offset, src_mat.NumCols());
-//           this_part.AddMat(alpha, src_mat);
-//         }
-//       }
-//     }
+    } else
+#endif
+    // {
+    // TO DO
+    // maxpooling backward propagation without cuda
+    // }
   }
 }
 
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 6d3b35ed19c..b12819eac39 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -509,24 +509,29 @@ class CuMatrixBase {
   void AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
                     MatrixTransposeType trans = kNoTrans);
 
-  /// This function does *this = max(*this, src). similar with AddMatBlocks,
-  /// it supports cases where *this and src have different dimension.  
-  /// There are two allowed cases:
+
+  /// This function is used for do the  maxpooling over blocks. The detailed 
+  /// description is written in the MaxPoolingOverBlock component in file
+  /// nnet-convolutional-component.h
   ///
-  ///  (1) *this is larger than src; this case is probably for backpropagation.
-  ///       In this case, we do a broadcasting operation.  *this must
-  ///       have NumRows() == a * src.NumRows() and NumCols() == b *
-  ///       src.NumCols() for integer a >= 1, b >= 1.  *this will be treated as
-  ///       being made up of of blocks with the same size as src, and to each
-  ///       block we'll add src.  This case does not support trans == kTrans.
+  /// To point out, this function has two version 'forward-propagate' and
+  /// 'backward-propagate':
   ///
-  ///  (2) *this is smaller than src; we select the maximum.  src.NumRows() must 
-  ///      == a * this->NumRows(), and src.NumCols() must == b * this->NumCols(), 
-  ///      for a >= 1, b >= 1.  In this case, src will be treated as being made 
-  ///      up of blocks with the same size as *this, and to *this we will select
-  ///      max value over all of those blocks.
-  void MaxMatBlocks(const CuMatrixBase<Real> &A,
-                    MatrixTransposeType trans = kNoTrans);
+  ///  (1)  When the size of input matrix &A is larger than *this, it is then a 
+  ///       'forward-propagate' version, and the function do the maxpooling 
+  ///       depending on the parameters. Meanwhile, it stores the index of 
+  ///       maximum value in each pool in vector 'index_max_' for backpropagation.
+  ///
+  ///  (2)  When the size of input matrix &A is smaller than *this, it is then a 
+  ///       'backward-propagate' version. According to the vector 'index_max_', the 
+  ///       function set all the values in &out_deriv whose index is not in 
+  ///       vector(not corresponding to maximum value in each pool of &in_value) 
+  ///       as zero, and keeps those correponding to maximum value as the *in_deriv.
+  void MaxMatBlocks(const CuMatrixBase<Real> &A, vector index_max_,
+                    MatrixTransposeType trans = kNoTrans,
+                    const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
+                    const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_,
+                    const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_);
 
   /// (for each column c of *this), c = alpha * col + beta * c
   void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc
index bea3b9d31d5..73de0ea5844 100644
--- a/src/nnet3/nnet-convolutional-component.cc
+++ b/src/nnet3/nnet-convolutional-component.cc
@@ -666,6 +666,169 @@ void TimeHeightConvolutionComponent::PrecomputedIndexes::Read(
   ExpectToken(is, binary, "</TimeHeightConvolutionComponentPrecomputedIndexes>");
 }
 
+MaxPoolingOverBlock::MaxPoolingOverBlock(
+    const MaxPoolingOverBlock &other):
+    input_t_dim_(other.input_t_dim_),
+    input_h_dim_(other.input_h_dim_),
+    input_f_dim_(other.input_f_dim_),
+    pool_t_size_(other.pool_t_size_),
+    pool_h_size_(other.pool_h_size_),
+    pool_f_size_(other.pool_f_size_),
+    pool_t_step_(other.pool_t_step_),
+    pool_h_step_(other.pool_h_step_),
+    pool_f_step_(other.pool_f_step_) { }
+
+// aquire input dim
+int32 MaxpoolingComponent::InputDim() const {
+  return input_t_dim_ * input_h_dim_ * input_f_dim_;
+}
+
+// aquire output dim
+int32 MaxpoolingComponent::OutputDim() const {
+  int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+  int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+  return num_pools_t * num_pools_h * num_pools_f;
+}
+
+// check the component parameters
+void MaxpoolingComponent::Check() const {
+  // sanity check of the max pooling parameters
+  KALDI_ASSERT(input_t_dim_ > 0);
+  KALDI_ASSERT(input_h_dim_ > 0);
+  KALDI_ASSERT(input_f_dim_ > 0);
+  KALDI_ASSERT(pool_t_size_ > 0);
+  KALDI_ASSERT(pool_h_size_ > 0);
+  KALDI_ASSERT(pool_f_size_ > 0);
+  KALDI_ASSERT(pool_t_step_ > 0);
+  KALDI_ASSERT(pool_h_step_ > 0);
+  KALDI_ASSERT(pool_f_step_ > 0);
+  KALDI_ASSERT(input_t_dim_ >= pool_t_size_);
+  KALDI_ASSERT(input_h_dim_ >= pool_h_size_);
+  KALDI_ASSERT(input_f_dim_ >= pool_f_size_);
+  KALDI_ASSERT(pool_t_size_ >= pool_t_step_);
+  KALDI_ASSERT(pool_h_size_ >= pool_h_step_);
+  KALDI_ASSERT(pool_f_size_ >= pool_f_step_);
+  KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_  == 0);
+  KALDI_ASSERT((input_h_dim_ - pool_h_size_) % pool_h_step_  == 0);
+  KALDI_ASSERT((input_f_dim_ - pool_f_size_) % pool_f_step_  == 0);
+}
+
+// initialize the component using configuration file
+void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) {
+  bool ok = true;
+
+  ok = ok && cfl->GetValue("input-t-dim", &input_t_dim_);
+  ok = ok && cfl->GetValue("input-h-dim", &input_h_dim_);
+  ok = ok && cfl->GetValue("input-f-dim", &input_f_dim_);
+  ok = ok && cfl->GetValue("pool-t-size", &pool_t_size_);
+  ok = ok && cfl->GetValue("pool-h-size", &pool_h_size_);
+  ok = ok && cfl->GetValue("pool-f-size", &pool_f_size_);
+  ok = ok && cfl->GetValue("pool-t-step", &pool_t_step_);
+  ok = ok && cfl->GetValue("pool-h-step", &pool_h_step_);
+  ok = ok && cfl->GetValue("pool-f-step", &pool_f_step_);
+
+  if (cfl->HasUnusedValues())
+    KALDI_ERR << "Could not process these elements in initializer: "
+              << cfl->UnusedValues();
+  if (!ok)
+    KALDI_ERR << "Bad initializer " 
+              << cfl->WholeLine();
+
+  Check();
+}
+
+void MaxPoolingOverBlock::Read(std::istream &is, bool binary) {
+  ExpectOneOrTwoTokens(is, binary, "<MaxPoolingOverBlock>", "<InputTDim>");
+  ReadBasicType(is, binary, &input_t_dim_);
+  ExpectToken(is, binary, "<InputHDim>");
+  ReadBasicType(is, binary, &input_h_dim_);
+  ExpectToken(is, binary, "<InputFDim>");
+  ReadBasicType(is, binary, &input_f_dim_);
+  ExpectToken(is, binary, "<PoolTize>");
+  ReadBasicType(is, binary, &pool_t_size_);
+  ExpectToken(is, binary, "<PoolHSize>");
+  ReadBasicType(is, binary, &pool_h_size_);
+  ExpectToken(is, binary, "<PoolFSize>");
+  ReadBasicType(is, binary, &pool_f_size_);
+  ExpectToken(is, binary, "<PoolTStep>");
+  ReadBasicType(is, binary, &pool_t_step_);
+  ExpectToken(is, binary, "<PoolHStep>");
+  ReadBasicType(is, binary, &pool_h_step_);
+  ExpectToken(is, binary, "<PoolFStep>");
+  ReadBasicType(is, binary, &pool_f_step_);
+  ExpectToken(is, binary, "</MaxPoolingOverBlock>");
+  Check();
+}
+
+void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<MaxPoolingOverBlock>");
+  WriteToken(os, binary, "<InputTDim>");
+  WriteBasicType(os, binary, input_t_dim_);
+  WriteToken(os, binary, "<InputHDim>");
+  WriteBasicType(os, binary, input_h_dim_);
+  WriteToken(os, binary, "<InputFDim>");
+  WriteBasicType(os, binary, input_f_dim_);
+  WriteToken(os, binary, "<PoolTSize>");
+  WriteBasicType(os, binary, pool_t_size_);
+  WriteToken(os, binary, "<PoolHSize>");
+  WriteBasicType(os, binary, pool_h_size_);
+  WriteToken(os, binary, "<PoolFSize>");
+  WriteBasicType(os, binary, pool_f_size_);
+  WriteToken(os, binary, "<PoolTStep>");
+  WriteBasicType(os, binary, pool_t_step_);
+  WriteToken(os, binary, "<PoolHStep>");
+  WriteBasicType(os, binary, pool_h_step_);
+  WriteToken(os, binary, "<PoolFStep>");
+  WriteBasicType(os, binary, pool_f_step_);
+  WriteToken(os, binary, "</MaxPoolingOverBlock>");
+}
+
+// display information about component
+std::string MaxPoolingOverBlock::Info() const {
+  std::ostringstream stream;
+  stream << Type()
+         << ", input-t-dim=" << input_t_dim_
+         << ", input-h-dim=" << input_h_dim_
+         << ", input-f-dim=" << input_f_dim_
+         << ", pool-t-size=" << pool_t_size_
+         << ", pool-h-size=" << pool_h_size_
+         << ", pool-f-size=" << pool_f_size_
+         << ", pool-t-step=" << pool_t_step_
+         << ", pool-h-step=" << pool_h_step_
+         << ", pool-f-step=" << pool_f_step_;
+  return stream.str();
+}
+
+void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
+                                     const CuMatrixBase<BaseFloat> &in,
+                                     CuMatrixBase<BaseFloat> *out) const {
+
+  out->MaxMatBlocks(in, index_max_, kNoTrans,
+                    input_t_dim_, pool_t_size_, pool_t_step_,
+                    input_h_dim_, pool_h_size_, pool_h_step_,
+                    input_f_dim_, pool_f_size_, pool_f_step_);
+  return NULL;
+}
+
+void MaxPoolingOverBlock::Backprop(
+    const std::string &debug_info,
+    const ComponentPrecomputedIndexes *indexes,
+    const CuMatrixBase<BaseFloat> &in_value, //in_value
+    const CuMatrixBase<BaseFloat> &out_value, // out_value,
+    const CuMatrixBase<BaseFloat> &out_deriv,
+    void *memo,
+    Component *to_update,
+    CuMatrixBase<BaseFloat> *in_deriv) const {
+
+  if (in_deriv) {
+    in_derv->MaxMatBlocks(out_deriv, index_max_, kNoTrans,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_);
+
+  }
+}
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index 35cf0de11c9..4677c894e69 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -25,6 +25,7 @@
 #include "nnet3/natural-gradient-online.h"
 #include "nnet3/convolution.h"
 #include <iostream>
+#include <vector>
 
 namespace kaldi {
 namespace nnet3 {
@@ -370,8 +371,171 @@ class TimeHeightConvolutionComponent: public UpdatableComponent {
   OnlineNaturalGradient preconditioner_out_;
 };
 
+/**  MaxPoolingOverBlock gets maximum value over blocks of its input
+  this component should be compatible with TimeHeightConvolutionComponent
+
+  MaxPoolingOverBlock :
+  MaxPoolingOverBlock component was firstly used in ConvNet for selecting an
+  representative activation in an area. It inspired Maxout nonlinearity.
+  Each output element of this component is the maximum of a block of
+  input elements where the block has a 2.5D dimension (pool_t_size_,
+  pool_h_size_ * pool_f_size_).
+  Blocks could overlap if the shift value on any axis is smaller
+  than its corresponding pool size (e.g. pool_t_step_ < pool_t_size_).
+  If the shift values are euqal to their pool size, there is no
+  overlap; while if they all equal 1, the blocks overlap to
+  the greatest possible extent.
+ 
+  This component is designed to be used after a ConvolutionComponent
+  so that the input matrix is propagated from a 2d-convolutional layer.
+  This component implements 2.5d-maxpooling which performs
+  max pooling along the three axes.
+ 
+  Input : A 2.5D matrix with dimensions:
+         t: (e.g. time)
+         h: (e.g. height, mel-frequency)
+         f: (e.g. channels like number of filters in the ConvolutionComponent)
+ 
+         The reason why we call the matrix 2.5D is because we compress the 3D block 
+         into a 2D matrix by concatenating each 2D matrix at different channel like:
+ 
+                     h = 0                   h = 1
+        |------------------------|------------------------|----...
+         f=0  f=1  f=2  ...  f=n  f=0  f=1  f=2  ...  f=n 
+        |----|----|----|----|----|----|----|----|----|----|----...
+    t=0  **** **** **** **** ****|**** **** **** **** ****|****... -
+    t=1  **** **** **** **** ****|**** **** **** **** ****|****... | m
+    t=2  **** **** **** **** ****|**** **** **** **** ****|****... | a
+    t=3  **** **** **** **** ****|**** **** **** **** ****|****... | t
+    t=4  **** **** **** **** ****|**** **** **** **** ****|****... | r
+    t=5  **** **** **** **** ****|**** **** **** **** ****|****... | i
+    t=6  **** **** **** **** ****|**** **** **** **** ****|****... | x
+    t=7  **** **** **** **** ****|**** **** **** **** ****|****... -
+    
+          In this case, if we set pool_t_size = 2, pool_t_step = 1
+                                  pool_h_size = 2, pool_h_step = 1
+                                  pool_f_size = 2, pool_f_step = 1
+          Then, the pooling block is like:
+          
+           h = 0     h = 1             h = 0     h = 1                  h = 1     h = 2              
+        |---------|---------|       |---------|---------|            |---------|---------|
+         f=0  f=1  f=0  f=1          f=1  f=2  f=1  f=2               f=0  f=1  f=0  f=1   
+        |----|----|----|----|       |----|----|----|----|   ...      |----|----|----|----|  ......
+    t=0  **** **** **** ****    t=0  **** **** **** ****         t=0  **** **** **** ****  
+    t=1  **** **** **** ****    t=1  **** **** **** ****         t=1  **** **** **** ****  
+
+
+           h = 0     f = 1             h = 0    h = 1                   h = 1     h = 2              
+        |---------|---------|       |---------|---------|            |---------|---------|
+         f=0  f=1  f=0  f=1          f=1  f=2  f=1  f=2               f=0  f=1  f=0  f=1   
+        |----|----|----|----|       |----|----|----|----|   ...      |----|----|----|----|  ......
+    t=1  **** **** **** ****    t=1  **** **** **** ****         t=1  **** **** **** ****  
+    t=2  **** **** **** ****    t=2  **** **** **** ****         t=2  **** **** **** ****  
+
+                  .                            .                               .
+                  .                            .                               .
+                  .                            .                               .
+                  .                            .                               .
+                  .                            .                               .
+                  .                            .                               .
+
+      Since the stride of filter(pool_f_step) is usually smaller than the 
+      stride of height(poo_h_step), we arrange each row of output as:
+      (all filters for height 0)(all filters for height 1)...
+
+  Parameters:
+
+            input_t_dim_    size of the input along t-axis
+                            (e.g. number of time steps)
+            input_h_dim_    size of input along h-axis
+                            (e.g. number of mel-frequency bins)
+            input_f_dim_    size of input along f-axis
+                            (e.g. number of filters in the ConvolutionComponent)
+
+            pool_t_size_    size of the pooling window along t-axis
+            pool_h_size_    size of the pooling window along h-axis
+            pool_f_size_    size of the pooling window along f-axis
+
+            pool_t_step_    the number of steps taken along t-axis of input
+                            before computing the next pool (e.g. the stride
+                             size along t-axis)
+            pool_h_step_    the number of steps taken along h-axis of input
+                            before computing the next pool (e.g. the stride
+                             size along t-axis)
+            pool_f_step_    the number of steps taken along f-axis of input
+                            before computing the next pool (e.g. the stride
+                             size along t-axis)
+
+            index_max_      a vector that store the index of the maximum 
+                            value as (t,h,f), used in back-propagation.
+
+ 
+ 
+  Output : The output is also a 2.5D tensor with dimension (num_block_t by 
+           num_block_h * num_block_f) where:
+ 
+           num_block_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
+           // the number of blocks in t dimension
+           num_block_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; 
+           // the number of blocks in h dimension
+           num_block_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; 
+           // the number of blocks in f dimension
+ 
+ 
+ */
+class MaxPoolingOverBlock: public Component {
+ public:
+  explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other);
+  MaxPoolingOverBlock(): input_t_dim_(0), input_h_dim_(0), input_f_dim_(0),
+                         pool_t_size_(0), pool_h_size_(0), pool_f_size_(0),
+                         pool_t_step_(0), pool_h_step_(0), pool_f_step_(0) { }
+  virtual std::string Type() const { return "MaxPoolingOverBlock"; }
+  virtual int32 Properties() const {
+    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|kBackpropAdds;
+  }
+  virtual void InitFromConfig(ConfigLine *cfl);
+  virtual int32 InputDim() const;// { return input_dim_; }
+  virtual int32 OutputDim() const;// { return output_dim_; }
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+  virtual std::string Info() const;
+  virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); }
+  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
+                          const CuMatrixBase<BaseFloat> &in,
+                          CuMatrixBase<BaseFloat> *out) const;
+  virtual void Backprop(const std::string &debug_info,
+                        const ComponentPrecomputedIndexes *indexes,
+                        const CuMatrixBase<BaseFloat> &in_value, //in_value
+                        const CuMatrixBase<BaseFloat> &out_value, // out_value,
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        void *memo,
+                        Component *to_update,
+                        CuMatrixBase<BaseFloat> *in_deriv) const;
+  virtual void Check() const;
 
-
+ private:
+  int32 input_t_dim_;   // size of the input along t-axis
+  // (e.g. number of time steps)
+  int32 input_h_dim_;   // size of input along h-axis
+  // (e.g. number of mel-frequency bins)
+  int32 input_f_dim_;   // size of input along f-axis
+  // (e.g. number of filters in the ConvolutionComponent)
+
+  int32 pool_t_size_;    // size of the pooling window along t-axis
+  int32 pool_h_size_;    // size of the pooling window along h-axis
+  int32 pool_f_size_;    // size of the pooling window along f-axis
+
+  int32 pool_t_step_;   // the number of steps taken along t-axis of input
+  //  before computing the next pool
+  int32 pool_h_step_;   // the number of steps taken along h-axis of input
+  // before computing the next pool
+  int32 pool_f_step_;   // the number of steps taken along f-axis of input
+  // before computing the next pool
+
+  vector index_max_;
+
+  MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
+};
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index a2f51e25a52..081e61bfa28 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -5860,82 +5860,5 @@ void SumBlockComponent::Backprop(
   }
 }
 
-MaxPoolingOverBlock::MaxPoolingOverBlock(const MaxPoolingOverBlock &other):
-    input_dim_(other.input_dim_), output_dim_(other.output_dim_) { }
-
-void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) {
-  bool ok = cfl->GetValue("input-dim", &input_dim_) &&
-      cfl->GetValue("output-dim", &output_dim_);
-  if (!ok)
-    KALDI_ERR << "input-dim and output-dim must both be provided.";
-  if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0)
-    KALDI_ERR << "Invalid values input-dim=" << input_dim_
-              << " output-dim=" << output_dim_;
-  if (cfl->HasUnusedValues())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << cfl->UnusedValues();
-}
-
-void MaxPoolingOverBlock::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<MaxPoolingOverBlock>", "<InputDim>");
-  ReadBasicType(is, binary, &input_dim_);
-  ExpectToken(is, binary, "<OutputDim>");
-  ReadBasicType(is, binary, &output_dim_);
-  ExpectToken(is, binary, "</MaxPoolingOverBlock>");
-}
-
-void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<MaxPoolingOverBlock>");
-  WriteToken(os, binary, "<InputDim>");
-  WriteBasicType(os, binary, input_dim_);
-  WriteToken(os, binary, "<OutputDim>");
-  WriteBasicType(os, binary, output_dim_);
-  WriteToken(os, binary, "</MaxPoolingOverBlock>");
-}
-
-std::string MaxPoolingOverBlock::Info() const {
-  std::ostringstream stream;
-  stream << Type() << ", input-dim=" << input_dim_
-         << ", output-dim=" << output_dim_;
-  return stream.str();
-}
-
-void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                   const CuMatrixBase<BaseFloat> &in,
-                                   CuMatrixBase<BaseFloat> *out) const {
-  KALDI_ASSERT(out->NumRows() == in.NumRows() &&
-               out->NumCols() == output_dim_ &&
-               in.NumCols() == input_dim_);
-  out->MaxMatBlocks(in, kNoTrans);
-  return NULL;
-}
-
-void MaxPoolingOverBlock::Backprop(
-    const std::string &debug_info,
-    const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &in_value, //in_value
-    const CuMatrixBase<BaseFloat> &out_value, // out_value,
-    const CuMatrixBase<BaseFloat> &out_deriv,
-    void *memo,
-    Component *to_update,
-    CuMatrixBase<BaseFloat> *in_deriv) const {
-  // if (in_deriv) {
-  //   in_deriv->MaxMatBlocks(out_deriv, kNoTrans);
-  // }
-  if (!in_deriv)
-    return;
-
-  int32 num_pools = input_dim_;
-  int32 pool_size = output_dim_ / input_dim_;
-
-  for (int32 q = 0; q < pool_size; q++) {
-    // zero-out mask
-    CuMatrix<BaseFloat> mask;
-    out_value.EqualElementMask(in_value.ColRange(q * num_pools, num_pools), &mask);
-    mask.MulElements(out_deriv);
-    in_deriv.ColRange(q * num_pools, num_pools).CopyFromMat(mask);
-  }
-}
-
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 6284a686333..55e7ff703cf 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1220,48 +1220,6 @@ class SumBlockComponent: public Component {
   SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow.
 };
 
-/**  MaxPoolingOverBlock gets maximum value over blocks of its input: for instance, if
-     you create one with the config "input-dim=400 output-dim=100",
-     its output will be the maximum value over the 4 100-dimensional blocks of
-     the input.
-
-     Accepted values on its config-file line are:
-        input-dim  The input dimension.  Required.
-        output-dim  The block dimension.  Required.  Must divide input-dim.
- */
-class MaxPoolingOverBlock: public Component {
- public:
-  explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other);
-  MaxPoolingOverBlock() { }
-  virtual std::string Type() const { return "MaxPoolingOverBlock"; }
-  virtual int32 Properties() const {
-    return kSimpleComponent|kPropagateAdds|kBackpropAdds;
-  }
-  virtual void (ConfigLine *cfl);
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual std::string Info() const;
-  virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); }
-  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
-                          const CuMatrixBase<BaseFloat> &in,
-                          CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value, //in_value
-                        const CuMatrixBase<BaseFloat> &out_value, // out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        void *memo,
-                        Component *to_update,
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
- private:
-  int32 input_dim_;
-  int32 output_dim_;
-  MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
-};
-
-
 /*
  ClipGradientComponent just duplicates its input, but clips gradients
  during backpropagation if they cross a predetermined threshold.

From 79f2c5f1f85aa2160caf48a873c31ed7f64dfdf0 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Thu, 3 May 2018 09:55:18 -0400
Subject: [PATCH 06/15] non-cuda case

---
 src/cudamatrix/cu-kernels-ansi.h          | 26 +++++++++++
 src/cudamatrix/cu-kernels.h               | 54 ++++++++++++++++-------
 src/cudamatrix/cu-matrix.cc               | 45 ++++++++++++++++---
 src/cudamatrix/cu-matrix.h                |  2 +-
 src/nnet3/nnet-convolutional-component.cc |  2 +-
 src/nnet3/nnet-convolutional-component.h  | 13 +++---
 6 files changed, 111 insertions(+), 31 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 6b99a77e73b..5520cb1c1db 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -110,6 +110,32 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim);
 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim);
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst,
+                                int32_cuda input_t_dim_,
+                                int32_cuda pool_t_size_,
+                                int32_cuda pool_t_step_,
+                                int32_cuda input_h_dim_,
+                                int32_cuda pool_h_size_,
+                                int32_cuda pool_h_step_,
+                                int32_cuda input_f_dim_,
+                                int32_cuda pool_f_size_,
+                                int32_cuda pool_f_step_, 
+                                double index_max_, int A_trans);
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
+                          int32_cuda input_t_dim_,
+                          int32_cuda pool_t_size_,
+                          int32_cuda pool_t_step_,
+                          int32_cuda input_h_dim_,
+                          int32_cuda pool_h_size_,
+                          int32_cuda pool_h_step_,
+                          int32_cuda input_f_dim_,
+                          int32_cuda pool_f_size_,
+                          int32_cuda pool_f_step_, 
+                          float index_max_, int A_trans);
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, 
+                            MatrixDim src_dim, double *dst, MatrixDim dst_dim);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, 
+                            MatrixDim src_dim, float *dst, MatrixDim dst_dim);
 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                             MatrixDim mat_dim, const double *mat2,
                             int mat2_row_stride, int mat2_col_stride,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 27a8d8a34fe..2804a624959 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -169,28 +169,50 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
   cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
 inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
-                                const double *src, int32_cuda num_row_blocks,
-                                int32_cuda num_col_blocks, double *dst,
-                                MatrixDim d, int src_stride, int A_trans) {
-  cudaD_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
-                       d, src_stride, A_trans);
-}
-inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src,
-                                int32_cuda num_row_blocks,
-                                int32_cuda num_col_blocks, float *dst,
-                                MatrixDim d, int src_stride, int A_trans) {
-  cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
-                       d, src_stride, A_trans);
+                                const double *src, double *dst,
+                                int32_cuda input_t_dim_,
+                                int32_cuda pool_t_size_,
+                                int32_cuda pool_t_step_,
+                                int32_cuda input_h_dim_,
+                                int32_cuda pool_h_size_,
+                                int32_cuda pool_h_step_,
+                                int32_cuda input_f_dim_,
+                                int32_cuda pool_f_size_,
+                                int32_cuda pool_f_step_, 
+                                double index_max_, int A_trans) {
+  cudaD_max_mat_blocks(Gr, Bl, src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_,   A_trans);
 }
-inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl,
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
+                                const float *src, float *dst,
+                                int32_cuda input_t_dim_,
+                                int32_cuda pool_t_size_,
+                                int32_cuda pool_t_step_,
+                                int32_cuda input_h_dim_,
+                                int32_cuda pool_h_size_,
+                                int32_cuda pool_h_step_,
+                                int32_cuda input_f_dim_,
+                                int32_cuda pool_f_size_,
+                                int32_cuda pool_f_step_, 
+                                float index_max_, int A_trans) {
+  cudaF_max_mat_blocks(Gr, Bl, src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_,   A_trans);
+}
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_,
                                   const double *src, MatrixDim src_dim,
                                   double *dst, MatrixDim dst_dim) {
-  cudaD_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
+  cudaD_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim);
 }
-inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl,
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_,
                                   const float *src, MatrixDim src_dim,
                                   float *dst, MatrixDim dst_dim) {
-  cudaF_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
+  cudaF_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim);
 }
 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                                   MatrixDim mat_dim, const double *mat2,
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index dcd37b76709..ae88db54df9 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1190,7 +1190,7 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
 
 template<typename Real>
 void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
-                                      vector index_max,
+                                      CuVectorBase<Real> &index_max_,
                                       MatrixTransposeType transA,
                                       const int32 input_t_dim_,
                                       const int32 pool_t_size_,
@@ -1248,10 +1248,37 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     } else
 #endif
-    // {
-    // TO DO
+    {
     // maxpooling without cuda
-    // }
+      int32 tmp = 0;
+      for (int32 t = 0; t < num_pools_t; t++) {
+        for (int32 h = 0; h < num_pools_t; h++) {
+          for (int32 f = 0; f < num_pools_f; f++) {
+            // initialize the maximum value as the first element in the pool
+            int32 max_x = 0; int32 max_y = 0; 
+            int32 max_value = A[t * pool_t_step_][h * pool_h_step_ * input_f_dim_ + f * pool_f_step_];
+            
+            // find the maximm value in the pool
+            for (int32 x = 0; x < pool_t_size_; x++) {
+              for (int32 y = 0; y < pool_h_size_; y++) { 
+                for (int32 z = 0; z < pool_f_size_; z++) {
+                  int32 cur_x = t * pool_t_step_ + x;
+                  int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z;
+                  if (A[cur_x][cur_y] > max_value) {
+                    max_x = cur_x
+                    max_y = cur_y;
+                    max_value = A[cur_x][cur_y];
+                    index_max_[tmp] = cur_x
+                    index_max_[tmp+1] = cur_y
+                  }
+                }
+              }
+            }
+            tmp += 2
+          }
+        }
+      }
+    }
   } else {
 
     // This is the "backward-propagation" version of MaxMatBlocks, where
@@ -1280,10 +1307,14 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     } else
 #endif
-    // {
-    // TO DO
+    {
     // maxpooling backward propagation without cuda
-    // }
+      *this.SetZero()
+      for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) {
+        *this[index_max_[x]][index_max_[x+1]] = 1
+      }
+      *this.MulElements(A);
+    }
   }
 }
 
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index b12819eac39..67e2c9ad6ca 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -527,7 +527,7 @@ class CuMatrixBase {
   ///       function set all the values in &out_deriv whose index is not in 
   ///       vector(not corresponding to maximum value in each pool of &in_value) 
   ///       as zero, and keeps those correponding to maximum value as the *in_deriv.
-  void MaxMatBlocks(const CuMatrixBase<Real> &A, vector index_max_,
+  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_,
                     MatrixTransposeType trans = kNoTrans,
                     const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
                     const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_,
diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc
index 73de0ea5844..16b6ff0da37 100644
--- a/src/nnet3/nnet-convolutional-component.cc
+++ b/src/nnet3/nnet-convolutional-component.cc
@@ -1,4 +1,4 @@
-// nnet3/nnet-convolutional-component.cc
+ // nnet3/nnet-convolutional-component.cc
 
 // Copyright      2017  Johns Hopkins University (author: Daniel Povey)
 
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index 4677c894e69..dd73216ff18 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -25,7 +25,6 @@
 #include "nnet3/natural-gradient-online.h"
 #include "nnet3/convolution.h"
 #include <iostream>
-#include <vector>
 
 namespace kaldi {
 namespace nnet3 {
@@ -467,18 +466,20 @@ class TimeHeightConvolutionComponent: public UpdatableComponent {
                              size along t-axis)
 
             index_max_      a vector that store the index of the maximum 
-                            value as (t,h,f), used in back-propagation.
+                            value as (r, c), used in back-propagation. The 
+                            size of this vector is 2 * num_pools_t * 
+                            num_pools_h * num_pools_f
 
  
  
   Output : The output is also a 2.5D tensor with dimension (num_block_t by 
            num_block_h * num_block_f) where:
  
-           num_block_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
+           num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
            // the number of blocks in t dimension
-           num_block_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; 
+           num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; 
            // the number of blocks in h dimension
-           num_block_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; 
+           num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; 
            // the number of blocks in f dimension
  
  
@@ -532,7 +533,7 @@ class MaxPoolingOverBlock: public Component {
   int32 pool_f_step_;   // the number of steps taken along f-axis of input
   // before computing the next pool
 
-  vector index_max_;
+  CuVectorBase<Real> index_max_; // the index of maximum value
 
   MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
 };

From 5ca6aba110fce7203298209375baf54b5fc2d990 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Thu, 3 May 2018 14:35:53 -0400
Subject: [PATCH 07/15] few updates about parameters' name

---
 src/cudamatrix/cu-kernels-ansi.h          |  40 ++++----
 src/cudamatrix/cu-kernels.cu              | 114 ++++++++++++++--------
 src/cudamatrix/cu-kernels.h               |   4 +-
 src/nnet3/nnet-convolutional-component.cc |  10 +-
 src/nnet3/nnet-convolutional-component.h  |  12 +--
 5 files changed, 107 insertions(+), 73 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 5520cb1c1db..a5fd632976f 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -111,27 +111,27 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src,
 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim);
 void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst,
-                                int32_cuda input_t_dim_,
-                                int32_cuda pool_t_size_,
-                                int32_cuda pool_t_step_,
-                                int32_cuda input_h_dim_,
-                                int32_cuda pool_h_size_,
-                                int32_cuda pool_h_step_,
-                                int32_cuda input_f_dim_,
-                                int32_cuda pool_f_size_,
-                                int32_cuda pool_f_step_, 
-                                double index_max_, int A_trans);
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_, 
+                          int index_max_, int A_trans);
 void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
-                          int32_cuda input_t_dim_,
-                          int32_cuda pool_t_size_,
-                          int32_cuda pool_t_step_,
-                          int32_cuda input_h_dim_,
-                          int32_cuda pool_h_size_,
-                          int32_cuda pool_h_step_,
-                          int32_cuda input_f_dim_,
-                          int32_cuda pool_f_size_,
-                          int32_cuda pool_f_step_, 
-                          float index_max_, int A_trans);
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,  
+                          int index_max_, int A_trans);
 void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, 
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim);
 void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 63ac13cd6ef..4699060520e 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -775,30 +775,37 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks(const Real* src,
-                            int32_cuda num_row_blocks,
-                            int32_cuda num_col_blocks, Real* dst, MatrixDim d,
-                            int src_stride) {
+static void _max_mat_blocks(const Real* src, Real* dst,
+                            const int32_cuda input_t_dim_,
+                            const int32_cuda pool_t_size_,
+                            const int32_cuda pool_t_step_,
+                            const int32_cuda input_h_dim_,
+                            const int32_cuda pool_h_size_,
+                            const int32_cuda pool_h_step_,
+                            const int32_cuda input_f_dim_,
+                            const int32_cuda pool_f_size_,
+                            const int32_cuda pool_f_step_, 
+                            int index_max_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-  int32_cuda index = i + j * d.stride;
-  int32_cuda index_src = i + j * src_stride;
-  if (i < d.cols && j < d.rows)
-    for (int32_cuda p = 0; p < num_row_blocks; p++) {
-      for (int32_cuda q = 0; q < num_col_blocks; q++) {
-        dst[index] = fmax(
-            src[index_src + p * src_stride * d.rows + q * d.cols],
-            dst[index]);
+
       }
     }
 }
 
 template<typename Real>
 __global__
-static void _max_mat_blocks_trans(const Real* src,
-                                  int32_cuda num_row_blocks,
-                                  int32_cuda num_col_blocks, Real* dst,
-                                  MatrixDim d, int src_stride) {
+static void _max_mat_blocks_trans(const Real* src, Real* dst,
+                                  const int32_cuda input_t_dim_,
+                                  const int32_cuda pool_t_size_,
+                                  const int32_cuda pool_t_step_,
+                                  const int32_cuda input_h_dim_,
+                                  const int32_cuda pool_h_size_,
+                                  const int32_cuda pool_h_step_,
+                                  const int32_cuda input_f_dim_,
+                                  const int32_cuda pool_f_size_,
+                                  const int32_cuda pool_f_step_, 
+                                  int index_max_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda index = i + j * d.stride;
@@ -815,9 +822,9 @@ static void _max_mat_blocks_trans(const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_repeated(const Real* src,
-                              MatrixDim src_dim, Real* dst,
-                              MatrixDim dst_dim) {
+static void _max_mat_blocks_back(const int index_max_, const Real* src,
+                                 MatrixDim src_dim, Real* dst,
+                                 MatrixDim dst_dim) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda src_i = i % src_dim.cols,
@@ -4007,16 +4014,29 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
   }
 }
 
-void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float* src,
-                          int32_cuda num_row_blocks, int32_cuda num_col_blocks,
-                          float* dst, MatrixDim d, int src_stride,
-                          int A_trans) {
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
+                        int32_cuda input_t_dim_,
+                        int32_cuda pool_t_size_,
+                        int32_cuda pool_t_step_,
+                        int32_cuda input_h_dim_,
+                        int32_cuda pool_h_size_,
+                        int32_cuda pool_h_step_,
+                        int32_cuda input_f_dim_,
+                        int32_cuda pool_f_size_,
+                        int32_cuda pool_f_step_, 
+                        int index_max_, int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks,
-        dst, d, src_stride);
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_);
   } else {
-    _max_mat_blocks<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks, dst,
-        d, src_stride);
+    _max_mat_blocks<<<Gr,Bl>>>(src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_);
   }
 }
 
@@ -4025,9 +4045,10 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, const float* src,
-                            MatrixDim src_dim, float *dst, MatrixDim dst_dim) {
-  _max_mat_repeated<<<Gr,Bl>>>(src, src_dim, dst, dst_dim);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, int index_max_,
+                                  const float *src, MatrixDim src_dim,
+                                  float *dst, MatrixDim dst_dim) {
+  _max_mat_blocks_back<<<Gr,Bl>>>(index_max_, src, src_dim, dst, dst_dim);
 }
 
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
@@ -4728,16 +4749,29 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
   }
 }
 
-void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double* src,
-                          int32_cuda num_row_blocks, int32_cuda num_col_blocks,
-                          double* dst, MatrixDim d, int src_stride,
-                          int A_trans) {
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst,
+                                int32_cuda input_t_dim_,
+                                int32_cuda pool_t_size_,
+                                int32_cuda pool_t_step_,
+                                int32_cuda input_h_dim_,
+                                int32_cuda pool_h_size_,
+                                int32_cuda pool_h_step_,
+                                int32_cuda input_f_dim_,
+                                int32_cuda pool_f_size_,
+                                int32_cuda pool_f_step_, 
+                                int index_max_, int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks,
-        dst, d, src_stride);
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_);
   } else {
-    _max_mat_blocks<<<Gr,Bl>>>(src, num_row_blocks, num_col_blocks, dst,
-        d, src_stride);
+    _max_mat_blocks<<<Gr,Bl>>>(src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_);
   }
 }
 
@@ -4746,9 +4780,9 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, const double* src,
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim) {
-  _max_mat_repeated<<<Gr,Bl>>>(src, src_dim, dst, dst_dim);
+  _max_mat_blocks_back<<<Gr,Bl>>>(index_max_, src, src_dim, dst, dst_dim);
 }
 
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 2804a624959..8985051a8c2 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -179,7 +179,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
                                 int32_cuda input_f_dim_,
                                 int32_cuda pool_f_size_,
                                 int32_cuda pool_f_step_, 
-                                double index_max_, int A_trans) {
+                                int index_max_, int A_trans) {
   cudaD_max_mat_blocks(Gr, Bl, src, dst,
                         input_t_dim_, pool_t_size_, pool_t_step_,
                         input_h_dim_, pool_h_size_, pool_h_step_,
@@ -197,7 +197,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
                                 int32_cuda input_f_dim_,
                                 int32_cuda pool_f_size_,
                                 int32_cuda pool_f_step_, 
-                                float index_max_, int A_trans) {
+                                int index_max_, int A_trans) {
   cudaF_max_mat_blocks(Gr, Bl, src, dst,
                         input_t_dim_, pool_t_size_, pool_t_step_,
                         input_h_dim_, pool_h_size_, pool_h_step_,
diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc
index 16b6ff0da37..9822b34a2a5 100644
--- a/src/nnet3/nnet-convolutional-component.cc
+++ b/src/nnet3/nnet-convolutional-component.cc
@@ -801,10 +801,10 @@ std::string MaxPoolingOverBlock::Info() const {
 }
 
 void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                     const CuMatrixBase<BaseFloat> &in,
-                                     CuMatrixBase<BaseFloat> *out) const {
+                                     const CuMatrixBase<BaseFloat> &in_value,
+                                     CuMatrixBase<BaseFloat> *out_value) const {
 
-  out->MaxMatBlocks(in, index_max_, kNoTrans,
+  out_value->MaxMatBlocks(in_value, index_max_, kNoTrans,
                     input_t_dim_, pool_t_size_, pool_t_step_,
                     input_h_dim_, pool_h_size_, pool_h_step_,
                     input_f_dim_, pool_f_size_, pool_f_step_);
@@ -814,8 +814,8 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
 void MaxPoolingOverBlock::Backprop(
     const std::string &debug_info,
     const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &in_value, //in_value
-    const CuMatrixBase<BaseFloat> &out_value, // out_value,
+    const CuMatrixBase<BaseFloat> &,//in_value,
+    const CuMatrixBase<BaseFloat> &,//out_value,
     const CuMatrixBase<BaseFloat> &out_deriv,
     void *memo,
     Component *to_update,
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index dd73216ff18..5e454be9cea 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -495,19 +495,19 @@ class MaxPoolingOverBlock: public Component {
     return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|kBackpropAdds;
   }
   virtual void InitFromConfig(ConfigLine *cfl);
-  virtual int32 InputDim() const;// { return input_dim_; }
-  virtual int32 OutputDim() const;// { return output_dim_; }
+  virtual int32 InputDim() const;
+  virtual int32 OutputDim() const;
   virtual void Read(std::istream &is, bool binary);
   virtual void Write(std::ostream &os, bool binary) const;
   virtual std::string Info() const;
   virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); }
   virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
-                          const CuMatrixBase<BaseFloat> &in,
-                          CuMatrixBase<BaseFloat> *out) const;
+                          const CuMatrixBase<BaseFloat> &in_value,
+                          CuMatrixBase<BaseFloat> *out_value) const;
   virtual void Backprop(const std::string &debug_info,
                         const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value, //in_value
-                        const CuMatrixBase<BaseFloat> &out_value, // out_value,
+                        const CuMatrixBase<BaseFloat> &,//in_value,
+                        const CuMatrixBase<BaseFloat> &,//out_value,
                         const CuMatrixBase<BaseFloat> &out_deriv,
                         void *memo,
                         Component *to_update,

From 7d53314df6925797f8464fa73433e29a56d3fad7 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Thu, 3 May 2018 17:42:07 -0400
Subject: [PATCH 08/15] cuda code for back propagation

---
 src/cudamatrix/cu-kernels-ansi.h |  26 +++++--
 src/cudamatrix/cu-kernels.cu     | 118 +++++++++++++++++++++----------
 src/cudamatrix/cu-kernels.h      |  44 +++++++++---
 src/cudamatrix/cu-matrix.cc      |  26 ++++---
 4 files changed, 156 insertions(+), 58 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index a5fd632976f..a5fd9e9d5a9 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -132,10 +132,28 @@ void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
                           const int32_cuda pool_f_size_,
                           const int32_cuda pool_f_step_,  
                           int index_max_, int A_trans);
-void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, 
-                            MatrixDim src_dim, double *dst, MatrixDim dst_dim);
-void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, 
-                            MatrixDim src_dim, float *dst, MatrixDim dst_dim);
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst,
+                              const int32_cuda num_pools_t,
+                              const int32_cuda pool_t_size_,
+                              const int32_cuda pool_t_step_,
+                              const int32_cuda num_pools_h,
+                              const int32_cuda pool_h_size_,
+                              const int32_cuda pool_h_step_,
+                              const int32_cuda num_pools_f,
+                              const int32_cuda pool_f_size_,
+                              const int32_cuda pool_f_step_, 
+                              int index_max_);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst,
+                              const int32_cuda num_pools_t,
+                              const int32_cuda pool_t_size_,
+                              const int32_cuda pool_t_step_,
+                              const int32_cuda num_pools_h,
+                              const int32_cuda pool_h_size_,
+                              const int32_cuda pool_h_step_,
+                              const int32_cuda num_pools_f,
+                              const int32_cuda pool_f_size_,
+                              const int32_cuda pool_f_step_,  
+                              int index_max_);
 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                             MatrixDim mat_dim, const double *mat2,
                             int mat2_row_stride, int mat2_col_stride,
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 4699060520e..a9198124b74 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -822,17 +822,38 @@ static void _max_mat_blocks_trans(const Real* src, Real* dst,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks_back(const int index_max_, const Real* src,
-                                 MatrixDim src_dim, Real* dst,
-                                 MatrixDim dst_dim) {
+static void _max_mat_blocks_back(const Real* src, Real* dst,
+                                  const int32_cuda num_pools_t,
+                                  const int32_cuda pool_t_size_,
+                                  const int32_cuda pool_t_step_,
+                                  const int32_cuda num_pools_h,
+                                  const int32_cuda pool_h_size_,
+                                  const int32_cuda pool_h_step_,
+                                  const int32_cuda num_pools_f,
+                                  const int32_cuda pool_f_size_,
+                                  const int32_cuda pool_f_step_, 
+                                  int index_max_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-  int32_cuda src_i = i % src_dim.cols,
-      src_j = j % src_dim.rows,
-      dst_index = i + j * dst_dim.stride,
-      src_index = src_i + src_j * src_dim.stride;
-  if (i < dst_dim.cols && j < dst_dim.rows)
-    dst[dst_index] += src[src_index];
+  int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+
+  for (int32_cuda x = 0; x < pool_t_size_ ; x ++) {
+    int32_cuda cur_x = i * pool_t_step_ + x;
+
+    for (int32_cuda y = 0; y < pool_h_size_ ; y++) {
+      for (int32_cuda z = 0; z < pool_f_size_; z++) {
+        int32_cuda cur_y = (j * pool_h_step_ + y) * input_f_dim_ + k * pool_f_step_ + z;
+        int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + f
+
+        if (cur_x == index_max_[idx_in_idxmax] && 
+            cur_y == index_max_[idx_in_idxmax + 1] ||
+            dst[cur_x][cur_y] == 1)
+          dst[cur_x][cur_y] = 1;
+        else
+          dst[cur_x][cur_y] = 0;
+      }
+    }
+  }
 }
 
 template<typename Real>
@@ -4015,16 +4036,16 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
 }
 
 void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
-                        int32_cuda input_t_dim_,
-                        int32_cuda pool_t_size_,
-                        int32_cuda pool_t_step_,
-                        int32_cuda input_h_dim_,
-                        int32_cuda pool_h_size_,
-                        int32_cuda pool_h_step_,
-                        int32_cuda input_f_dim_,
-                        int32_cuda pool_f_size_,
-                        int32_cuda pool_f_step_, 
-                        int index_max_, int A_trans) {
+                          int32_cuda input_t_dim_,
+                          int32_cuda pool_t_size_,
+                          int32_cuda pool_t_step_,
+                          int32_cuda input_h_dim_,
+                          int32_cuda pool_h_size_,
+                          int32_cuda pool_h_step_,
+                          int32_cuda input_f_dim_,
+                          int32_cuda pool_f_size_,
+                          int32_cuda pool_f_step_, 
+                          int index_max_, int A_trans) {
   if (A_trans) {
     _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst,
                         input_t_dim_, pool_t_size_, pool_t_step_,
@@ -4045,10 +4066,22 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, int index_max_,
-                                  const float *src, MatrixDim src_dim,
-                                  float *dst, MatrixDim dst_dim) {
-  _max_mat_blocks_back<<<Gr,Bl>>>(index_max_, src, src_dim, dst, dst_dim);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst,
+                              int32_cuda input_t_dim_,
+                              int32_cuda pool_t_size_,
+                              int32_cuda pool_t_step_,
+                              int32_cuda input_h_dim_,
+                              int32_cuda pool_h_size_,
+                              int32_cuda pool_h_step_,
+                              int32_cuda input_f_dim_,
+                              int32_cuda pool_f_size_,
+                              int32_cuda pool_f_step_, 
+                              int index_max_) {
+  _max_mat_blocks_back<<<Gr,Bl>>>(src, dst,
+                                  input_t_dim_, pool_t_size_, pool_t_step_,
+                                  input_h_dim_, pool_h_size_, pool_h_step_,
+                                  input_f_dim_, pool_f_size_, pool_f_step_,
+                                  index_max_);
 }
 
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
@@ -4750,16 +4783,16 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
 }
 
 void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst,
-                                int32_cuda input_t_dim_,
-                                int32_cuda pool_t_size_,
-                                int32_cuda pool_t_step_,
-                                int32_cuda input_h_dim_,
-                                int32_cuda pool_h_size_,
-                                int32_cuda pool_h_step_,
-                                int32_cuda input_f_dim_,
-                                int32_cuda pool_f_size_,
-                                int32_cuda pool_f_step_, 
-                                int index_max_, int A_trans) {
+                          int32_cuda input_t_dim_,
+                          int32_cuda pool_t_size_,
+                          int32_cuda pool_t_step_,
+                          int32_cuda input_h_dim_,
+                          int32_cuda pool_h_size_,
+                          int32_cuda pool_h_step_,
+                          int32_cuda input_f_dim_,
+                          int32_cuda pool_f_size_,
+                          int32_cuda pool_f_step_, 
+                          int index_max_, int A_trans) {
   if (A_trans) {
     _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst,
                         input_t_dim_, pool_t_size_, pool_t_step_,
@@ -4780,9 +4813,22 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src,
-                            MatrixDim src_dim, double *dst, MatrixDim dst_dim) {
-  _max_mat_blocks_back<<<Gr,Bl>>>(index_max_, src, src_dim, dst, dst_dim);
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst,
+                              int32_cuda input_t_dim_,
+                              int32_cuda pool_t_size_,
+                              int32_cuda pool_t_step_,
+                              int32_cuda input_h_dim_,
+                              int32_cuda pool_h_size_,
+                              int32_cuda pool_h_step_,
+                              int32_cuda input_f_dim_,
+                              int32_cuda pool_f_size_,
+                              int32_cuda pool_f_step_, 
+                              int index_max_) {
+  _max_mat_blocks_back<<<Gr,Bl>>>(src, dst,
+                        input_t_dim_, pool_t_size_, pool_t_step_,
+                        input_h_dim_, pool_h_size_, pool_h_step_,
+                        input_f_dim_, pool_f_size_, pool_f_step_,
+                        index_max_);
 }
 
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 8985051a8c2..a2faf2682d3 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -204,15 +204,41 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
                         input_f_dim_, pool_f_size_, pool_f_step_,
                         index_max_,   A_trans);
 }
-inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_,
-                                  const double *src, MatrixDim src_dim,
-                                  double *dst, MatrixDim dst_dim) {
-  cudaD_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim);
-}
-inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_,
-                                  const float *src, MatrixDim src_dim,
-                                  float *dst, MatrixDim dst_dim) {
-  cudaF_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim);
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                                     const double *src, double *dst,
+                                     int32_cuda num_pools_t,
+                                     int32_cuda pool_t_size_,
+                                     int32_cuda pool_t_step_,
+                                     int32_cuda num_pools_h,
+                                     int32_cuda pool_h_size_,
+                                     int32_cuda pool_h_step_,
+                                     int32_cuda num_pools_f,
+                                     int32_cuda pool_f_size_,
+                                     int32_cuda pool_f_step_, 
+                                     int index_max_) {
+  cudaD_max_mat_blocks_back(Gr, Bl, src, dst,
+                            num_pools_t, pool_t_size_, pool_t_step_,
+                            num_pools_h, pool_h_size_, pool_h_step_,
+                            num_pools_f, pool_f_size_, pool_f_step_,
+                            index_max_);
+}
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                                     const float *src, float *dst,
+                                     int32_cuda num_pools_t,
+                                     int32_cuda pool_t_size_,
+                                     int32_cuda pool_t_step_,
+                                     int32_cuda num_pools_h,
+                                     int32_cuda pool_h_size_,
+                                     int32_cuda pool_h_step_,
+                                     int32_cuda num_pools_f,
+                                     int32_cuda pool_f_size_,
+                                     int32_cuda pool_f_step_, 
+                                     int index_max_) {
+  cudaF_max_mat_blocks_back(Gr, Bl, src, dst,
+                            num_pools_t, pool_t_size_, pool_t_step_,
+                            num_pools_h, pool_h_size_, pool_h_step_,
+                            num_pools_f, pool_f_size_, pool_f_step_,
+                            index_max_);
 }
 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                                   MatrixDim mat_dim, const double *mat2,
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index ae88db54df9..49958a9fff0 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1234,11 +1234,11 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
     if (CuDevice::Instantiate().Enabled()) {
       CuTimer tim;
       dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
-      dim3 dimGrid(1024);
+      dim3 dimGrid(1);
       // dim3 dimGrid, dimBlock;
       // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
       //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, Dim(), 
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,
                           input_t_dim_, pool_t_size_, pool_t_step_,
                           input_h_dim_, pool_h_size_, pool_h_step_,
                           input_f_dim_, pool_f_size_, pool_f_step_,
@@ -1260,9 +1260,10 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
             
             // find the maximm value in the pool
             for (int32 x = 0; x < pool_t_size_; x++) {
+              int32 cur_x = t * pool_t_step_ + x;
+
               for (int32 y = 0; y < pool_h_size_; y++) { 
-                for (int32 z = 0; z < pool_f_size_; z++) {
-                  int32 cur_x = t * pool_t_step_ + x;
+                for (int32 z = 0; z < pool_f_size_; z++) {              
                   int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z;
                   if (A[cur_x][cur_y] > max_value) {
                     max_x = cur_x
@@ -1298,11 +1299,18 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {
       CuTimer tim;
-      dim3 dimGrid, dimBlock;
-      GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
-                                            &dimGrid, &dimBlock);
-      cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_,
-                            A.data_, A.Dim(), data_, Dim());
+      dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
+      dim3 dimGrid(1);
+      // dim3 dimGrid, dimBlock;
+      // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+      //                                       &dimGrid, &dimBlock);
+      cuda_max_mat_blocks_back(A.data_, data_, Dim(), 
+                          num_pools_t, pool_t_size_, pool_t_step_,
+                          num_pools_h, pool_h_size_, pool_h_step_,
+                          num_pools_f, pool_f_size_, pool_f_step_,
+                          index_max_,)
+      // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_,
+      //                       A.data_, A.Dim(), data_, Dim());
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     } else

From a91c345fb9f9e3eed0205ba9e3913371f306befe Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Fri, 4 May 2018 16:06:19 -0400
Subject: [PATCH 09/15] cuda programming

---
 src/cudamatrix/cu-kernels-ansi.h |  72 ++++---
 src/cudamatrix/cu-kernels.cu     | 317 ++++++++++++++++++-------------
 src/cudamatrix/cu-kernels.h      | 128 ++++++-------
 src/cudamatrix/cu-matrix.cc      |  20 +-
 4 files changed, 285 insertions(+), 252 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index a5fd9e9d5a9..386b43ae16d 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -110,50 +110,48 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim);
 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim);
-void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst,
-                          const int32_cuda input_t_dim_,
-                          const int32_cuda pool_t_size_,
-                          const int32_cuda pool_t_step_,
-                          const int32_cuda input_h_dim_,
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, 
+                          const double *src, double *dst, double *index_max_,
+                          const int32_cuda pool_t_size_,                            
                           const int32_cuda pool_h_size_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda input_f_dim_,
                           const int32_cuda pool_f_size_,
-                          const int32_cuda pool_f_step_, 
-                          int index_max_, int A_trans);
-void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
-                          const int32_cuda input_t_dim_,
-                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda input_f_dim_,
+                          int A_tran);
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, 
+                          const float *src, float *dst, float *index_max_,
+                          const int32_cuda pool_t_size_,                            
                           const int32_cuda pool_h_size_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_t_step_,
                           const int32_cuda pool_h_step_,
+                          const int32_cuda pool_f_step_,
+                          const int32_cuda input_h_dim_,
                           const int32_cuda input_f_dim_,
-                          const int32_cuda pool_f_size_,
-                          const int32_cuda pool_f_step_,  
-                          int index_max_, int A_trans);
-void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst,
-                              const int32_cuda num_pools_t,
-                              const int32_cuda pool_t_size_,
-                              const int32_cuda pool_t_step_,
-                              const int32_cuda num_pools_h,
-                              const int32_cuda pool_h_size_,
-                              const int32_cuda pool_h_step_,
-                              const int32_cuda num_pools_f,
-                              const int32_cuda pool_f_size_,
-                              const int32_cuda pool_f_step_, 
-                              int index_max_);
-void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst,
-                              const int32_cuda num_pools_t,
-                              const int32_cuda pool_t_size_,
-                              const int32_cuda pool_t_step_,
-                              const int32_cuda num_pools_h,
-                              const int32_cuda pool_h_size_,
-                              const int32_cuda pool_h_step_,
-                              const int32_cuda num_pools_f,
-                              const int32_cuda pool_f_size_,
-                              const int32_cuda pool_f_step_,  
-                              int index_max_);
+                          int A_tran);
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
+                               const double *src, double *dst, double *index_max_,
+                               const int32_cuda pool_t_size_,                            
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda pool_f_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda input_f_dim_);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                               const float *src, float *dst, float *index_max_,
+                               const int32_cuda pool_t_size_,                            
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda pool_f_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda input_f_dim_);
 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                             MatrixDim mat_dim, const double *mat2,
                             int mat2_row_stride, int mat2_col_stride,
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index a9198124b74..f043f64b64b 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -775,82 +775,135 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks(const Real* src, Real* dst,
-                            const int32_cuda input_t_dim_,
-                            const int32_cuda pool_t_size_,
-                            const int32_cuda pool_t_step_,
-                            const int32_cuda input_h_dim_,
+static void _max_mat_blocks(const Real* src, Real* dst, Real* index_max_,
+                            const int32_cuda pool_t_size_,                            
                             const int32_cuda pool_h_size_,
-                            const int32_cuda pool_h_step_,
-                            const int32_cuda input_f_dim_,
                             const int32_cuda pool_f_size_,
-                            const int32_cuda pool_f_step_, 
-                            int index_max_) {
+                            const int32_cuda pool_t_step_,
+                            const int32_cuda pool_h_step_,
+                            const int32_cuda pool_f_step_,
+                            const int32_cuda input_h_dim_,
+                            const int32_cuda input_f_dim_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-
-      }
+  int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+  int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  // initialize the temporary maximum value and its index in each pool
+  int32_cuda max_row = i * pool_t_step_; 
+  int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_;
+  int32_cuda max_value = src[max_row][max_col];
+
+  // loop over all the elements in each pool to find the maximum one,
+  // and record its index.
+
+  for (int32_cuda t = 0; t < pool_t_size_; t++) {
+    // the index of row in *src
+    idx_row = i * pool_t_step_ + t;
+
+    for (int32_cuda h = 0; h < pool_h_size_; h++) {
+      for (int32_cuda f = 0; f < pool_f_size_; f++) {
+        // the index of column in *src
+        idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
+
+        if (src[idx_row][idx_col] > max_value) {
+          max_row = idx_row;
+          max_col = idx_col;
+          max_value = src[idx_row][idx_col]
+        }
+      } 
     }
+  }
+
+  dst[i][j * num_pools_f + k] = max_value;
+
+  // the index of indexes stored in vector 'index_max_'.
+  int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k;
+  index_max_[idx_in_idxmax] = max_row;
+  index_max_[idx_in_idxmax + 1] = max_col;
 }
 
+// this function is basicall the same as _max_mat_blocks, except it
+// deal with the transpose matrix of *src. So the column and row index
+// are exchanged.
 template<typename Real>
 __global__
-static void _max_mat_blocks_trans(const Real* src, Real* dst,
-                                  const int32_cuda input_t_dim_,
-                                  const int32_cuda pool_t_size_,
-                                  const int32_cuda pool_t_step_,
-                                  const int32_cuda input_h_dim_,
+static void _max_mat_blocks_trans(const Real* src, Real* dst, Real* index_max_,
+                                  const int32_cuda pool_t_size_,                            
                                   const int32_cuda pool_h_size_,
-                                  const int32_cuda pool_h_step_,
-                                  const int32_cuda input_f_dim_,
                                   const int32_cuda pool_f_size_,
-                                  const int32_cuda pool_f_step_, 
-                                  int index_max_) {
+                                  const int32_cuda pool_t_step_,
+                                  const int32_cuda pool_h_step_,
+                                  const int32_cuda pool_f_step_,
+                                  const int32_cuda input_h_dim_,
+                                  const int32_cuda input_f_dim_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-  int32_cuda index = i + j * d.stride;
-  int32_cuda index_src = j + i * src_stride;
-  if (i < d.cols && j < d.rows)
-    for (int32_cuda p = 0; p < num_row_blocks; p++) {
-      for (int32_cuda q = 0; q < num_col_blocks; q++) {
-        dst[index] = fmax(
-            src[index_src + p * src_stride * d.cols + q * d.rows],
-            dst[index]);
-      }
+  int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+  int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  int32_cuda max_row = i * pool_t_step_; 
+  int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_;
+  int32_cuda max_value = src[max_col][max_row];
+
+  for (int32_cuda t = 0; t < pool_t_size_; t++) {
+  	idx_row = i * pool_t_step_ + t;
+
+    for (int32_cuda h = 0; h < pool_h_size_; h++) {
+      for (int32_cuda f = 0; f < pool_f_size_; f++) {
+        idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
+
+        if (src[idx_col][idx_row] > max_value) {
+          max_row = idx_row;
+          max_col = idx_col;
+          max_value = src[idx_col][idx_row]
+        }
+      } 
     }
+  }
+
+  dst[j * num_pools_f + k][i] = max_value;
+
+  int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k;
+  index_max_[idx_in_idxmax] = max_row;
+  index_max_[idx_in_idxmax + 1] = max_col;
+
 }
 
 template<typename Real>
 __global__
-static void _max_mat_blocks_back(const Real* src, Real* dst,
-                                  const int32_cuda num_pools_t,
-                                  const int32_cuda pool_t_size_,
-                                  const int32_cuda pool_t_step_,
-                                  const int32_cuda num_pools_h,
-                                  const int32_cuda pool_h_size_,
-                                  const int32_cuda pool_h_step_,
-                                  const int32_cuda num_pools_f,
-                                  const int32_cuda pool_f_size_,
-                                  const int32_cuda pool_f_step_, 
-                                  int index_max_) {
+static void _max_mat_blocks_back(const Real* src, Real* dst, Real* index_max_,
+                                 const int32_cuda pool_t_size_,                            
+                                 const int32_cuda pool_h_size_,
+                                 const int32_cuda pool_f_size_,
+                                 const int32_cuda pool_t_step_,
+                                 const int32_cuda pool_h_step_,
+                                 const int32_cuda pool_f_step_,
+                                 const int32_cuda input_h_dim_,
+                                 const int32_cuda input_f_dim_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+  int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
 
-  for (int32_cuda x = 0; x < pool_t_size_ ; x ++) {
-    int32_cuda cur_x = i * pool_t_step_ + x;
+  for (int32_cuda t = 0; t < pool_t_size_ ; t++) {
+    int32_cuda idx_row = i * pool_t_step_ + t;
 
-    for (int32_cuda y = 0; y < pool_h_size_ ; y++) {
-      for (int32_cuda z = 0; z < pool_f_size_; z++) {
-        int32_cuda cur_y = (j * pool_h_step_ + y) * input_f_dim_ + k * pool_f_step_ + z;
-        int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + f
+    for (int32_cuda h = 0; h < pool_h_size_ ; h++) {
+      for (int32_cuda f = 0; f < pool_f_size_; f++) {
+        int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
+        int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k;
 
-        if (cur_x == index_max_[idx_in_idxmax] && 
-            cur_y == index_max_[idx_in_idxmax + 1] ||
-            dst[cur_x][cur_y] == 1)
-          dst[cur_x][cur_y] = 1;
-        else
-          dst[cur_x][cur_y] = 0;
+        if (idx_row == index_max_[idx_in_idxmax] && 
+            idx_col == index_max_[idx_in_idxmax + 1] ||
+            dst[idx_row][idx_col] != 0) {
+          dst[idx_row][idx_col] = src[i][j * num_pools_f + k];
+        } else {
+          dst[idx_row][idx_col] = 0;
+        }
       }
     }
   }
@@ -4035,53 +4088,49 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src,
   }
 }
 
-void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst,
-                          int32_cuda input_t_dim_,
-                          int32_cuda pool_t_size_,
-                          int32_cuda pool_t_step_,
-                          int32_cuda input_h_dim_,
-                          int32_cuda pool_h_size_,
-                          int32_cuda pool_h_step_,
-                          int32_cuda input_f_dim_,
-                          int32_cuda pool_f_size_,
-                          int32_cuda pool_f_step_, 
-                          int index_max_, int A_trans) {
-  if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_);
-  } else {
-    _max_mat_blocks<<<Gr,Bl>>>(src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_);
-  }
-}
-
 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim) {
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst,
-                              int32_cuda input_t_dim_,
-                              int32_cuda pool_t_size_,
-                              int32_cuda pool_t_step_,
-                              int32_cuda input_h_dim_,
-                              int32_cuda pool_h_size_,
-                              int32_cuda pool_h_step_,
-                              int32_cuda input_f_dim_,
-                              int32_cuda pool_f_size_,
-                              int32_cuda pool_f_step_, 
-                              int index_max_) {
-  _max_mat_blocks_back<<<Gr,Bl>>>(src, dst,
-                                  input_t_dim_, pool_t_size_, pool_t_step_,
-                                  input_h_dim_, pool_h_size_, pool_h_step_,
-                                  input_f_dim_, pool_f_size_, pool_f_step_,
-                                  index_max_);
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, 
+                          const float *src, float *dst, float *index_max_,
+                          const int32_cuda pool_t_size_,                            
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda pool_f_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda input_f_dim_,
+                          int A_trans) {
+  if (A_trans) {
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_
+                        pool_t_size_, pool_h_size_, pool_f_size_,
+                        pool_t_step_, pool_h_step_, pool_f_step_,
+                                      input_h_dim_, input_f_dim_);
+  } else {
+    _max_mat_blocks<<<Gr,Bl>>>(src, dst, index_max_,
+                        pool_t_size_, pool_h_size_, pool_f_size_,
+                        pool_t_step_, pool_h_step_, pool_f_step_,
+                                      input_h_dim_, input_f_dim_);
+  }
+}
+
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
+                               const float *src, float *dst, float *index_max_,
+                               const int32_cuda pool_t_size_,                            
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda pool_f_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda input_f_dim_) {
+  _max_mat_blocks_back<<<Gr,Bl>>>(src, dst, index_max_,
+                                  pool_t_size_, pool_h_size_, pool_f_size_,
+                                  pool_t_step_, pool_h_step_, pool_f_step_,
+                                                input_h_dim_, input_f_dim_);
 }
 
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
@@ -4782,53 +4831,49 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src,
   }
 }
 
-void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst,
-                          int32_cuda input_t_dim_,
-                          int32_cuda pool_t_size_,
-                          int32_cuda pool_t_step_,
-                          int32_cuda input_h_dim_,
-                          int32_cuda pool_h_size_,
-                          int32_cuda pool_h_step_,
-                          int32_cuda input_f_dim_,
-                          int32_cuda pool_f_size_,
-                          int32_cuda pool_f_step_, 
-                          int index_max_, int A_trans) {
-  if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_);
-  } else {
-    _max_mat_blocks<<<Gr,Bl>>>(src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_);
-  }
-}
-
 void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim) {
   _add_mat_repeated<<<Gr,Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
-void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst,
-                              int32_cuda input_t_dim_,
-                              int32_cuda pool_t_size_,
-                              int32_cuda pool_t_step_,
-                              int32_cuda input_h_dim_,
-                              int32_cuda pool_h_size_,
-                              int32_cuda pool_h_step_,
-                              int32_cuda input_f_dim_,
-                              int32_cuda pool_f_size_,
-                              int32_cuda pool_f_step_, 
-                              int index_max_) {
-  _max_mat_blocks_back<<<Gr,Bl>>>(src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_);
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, 
+                          const double *src, double *dst, double *index_max_,
+                          const int32_cuda pool_t_size_,                            
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda pool_f_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda input_f_dim_,
+                          int A_trans) {
+  if (A_trans) {
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_,
+                              pool_t_size_, pool_h_size_, pool_f_size_,
+                              pool_t_step_, pool_h_step_, pool_f_step_,
+                                            input_h_dim_, input_f_dim_);
+  } else {
+    _max_mat_blocks<<<Gr,Bl>>>(src, dst, index_max_,
+                              pool_t_size_, pool_h_size_, pool_f_size_,
+                              pool_t_step_, pool_h_step_, pool_f_step_,
+                                            input_h_dim_, input_f_dim_);
+  }
+}
+
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
+                               const double *src, double *dst, double *index_max_,
+                               const int32_cuda pool_t_size_,                            
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda pool_f_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda input_f_dim_) {
+  _max_mat_blocks_back<<<Gr,Bl>>>(src, dst, index_max_,
+                                  pool_t_size_, pool_h_size_, pool_f_size_,
+                                  pool_t_step_, pool_h_step_, pool_f_step_,
+                                                input_h_dim_, input_f_dim_);
 }
 
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index a2faf2682d3..0d8361cb38d 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -168,77 +168,67 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
                                   float *dst, MatrixDim dst_dim) {
   cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
-inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
-                                const double *src, double *dst,
-                                int32_cuda input_t_dim_,
-                                int32_cuda pool_t_size_,
-                                int32_cuda pool_t_step_,
-                                int32_cuda input_h_dim_,
-                                int32_cuda pool_h_size_,
-                                int32_cuda pool_h_step_,
-                                int32_cuda input_f_dim_,
-                                int32_cuda pool_f_size_,
-                                int32_cuda pool_f_step_, 
-                                int index_max_, int A_trans) {
-  cudaD_max_mat_blocks(Gr, Bl, src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_,   A_trans);
-}
-inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
-                                const float *src, float *dst,
-                                int32_cuda input_t_dim_,
-                                int32_cuda pool_t_size_,
-                                int32_cuda pool_t_step_,
-                                int32_cuda input_h_dim_,
-                                int32_cuda pool_h_size_,
-                                int32_cuda pool_h_step_,
-                                int32_cuda input_f_dim_,
-                                int32_cuda pool_f_size_,
-                                int32_cuda pool_f_step_, 
-                                int index_max_, int A_trans) {
-  cudaF_max_mat_blocks(Gr, Bl, src, dst,
-                        input_t_dim_, pool_t_size_, pool_t_step_,
-                        input_h_dim_, pool_h_size_, pool_h_step_,
-                        input_f_dim_, pool_f_size_, pool_f_step_,
-                        index_max_,   A_trans);
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, 
+                          const double *src, double *dst, double *index_max_,
+                          const int32_cuda pool_t_size_,                            
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda pool_f_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda input_f_dim_,
+                          int A_tran) {
+  cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_,
+                       pool_t_size_, pool_h_size_, pool_f_size_,
+                       pool_t_step_, pool_h_step_, pool_f_step_,
+                                     input_h_dim_, input_f_dim_, A_trans);
+}
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, 
+                          const float *src, float *dst, float *index_max_,
+                          const int32_cuda pool_t_size_,                            
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda pool_f_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda input_f_dim_,
+                          int A_tran) {
+  cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_,
+                       pool_t_size_, pool_h_size_, pool_f_size_,
+                       pool_t_step_, pool_h_step_, pool_f_step_,
+                                     input_h_dim_, input_f_dim_, A_trans);
+}
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
+                               const double *src, double *dst, double *index_max_,
+                               const int32_cuda pool_t_size_,                            
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda pool_f_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda input_f_dim_) {
+  cudaD_max_mat_blocks_back(Gr, Bl, src, dst, index_max_,
+                            pool_t_size_, pool_h_size_, pool_f_size_,
+                            pool_t_step_, pool_h_step_, pool_f_step_,
+                                          input_h_dim_, input_f_dim_);
 }
 inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
-                                     const double *src, double *dst,
-                                     int32_cuda num_pools_t,
-                                     int32_cuda pool_t_size_,
-                                     int32_cuda pool_t_step_,
-                                     int32_cuda num_pools_h,
-                                     int32_cuda pool_h_size_,
-                                     int32_cuda pool_h_step_,
-                                     int32_cuda num_pools_f,
-                                     int32_cuda pool_f_size_,
-                                     int32_cuda pool_f_step_, 
-                                     int index_max_) {
-  cudaD_max_mat_blocks_back(Gr, Bl, src, dst,
-                            num_pools_t, pool_t_size_, pool_t_step_,
-                            num_pools_h, pool_h_size_, pool_h_step_,
-                            num_pools_f, pool_f_size_, pool_f_step_,
-                            index_max_);
-}
-inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
-                                     const float *src, float *dst,
-                                     int32_cuda num_pools_t,
-                                     int32_cuda pool_t_size_,
-                                     int32_cuda pool_t_step_,
-                                     int32_cuda num_pools_h,
-                                     int32_cuda pool_h_size_,
-                                     int32_cuda pool_h_step_,
-                                     int32_cuda num_pools_f,
-                                     int32_cuda pool_f_size_,
-                                     int32_cuda pool_f_step_, 
-                                     int index_max_) {
-  cudaF_max_mat_blocks_back(Gr, Bl, src, dst,
-                            num_pools_t, pool_t_size_, pool_t_step_,
-                            num_pools_h, pool_h_size_, pool_h_step_,
-                            num_pools_f, pool_f_size_, pool_f_step_,
-                            index_max_);
+                               const float *src, float *dst, float *index_max_,
+                               const int32_cuda pool_t_size_,                            
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda pool_f_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda input_f_dim_) {
+  cudaF_max_mat_blocks_back(Gr, Bl, src, dst, index_max_,
+                            pool_t_size_, pool_h_size_, pool_f_size_,
+                            pool_t_step_, pool_h_step_, pool_f_step_,
+                                          input_h_dim_, input_f_dim_);
 }
 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                                   MatrixDim mat_dim, const double *mat2,
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 49958a9fff0..9af200a8b5e 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1238,11 +1238,11 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       // dim3 dimGrid, dimBlock;
       // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
       //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,
-                          input_t_dim_, pool_t_size_, pool_t_step_,
-                          input_h_dim_, pool_h_size_, pool_h_step_,
-                          input_f_dim_, pool_f_size_, pool_f_step_,
-                          index_max_, (transA == kTrans ? 1 : 0));
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_,
+                          pool_t_size_, pool_h_size_, pool_f_size_,
+                          pool_t_step_, pool_h_step_, pool_f_step_,
+                                        input_h_dim_, input_f_dim_,
+                          (transA == kTrans ? 1 : 0));
       CU_SAFE_CALL(cudaGetLastError());
 
       CuDevice::Instantiate().AccuProfile(__func__, tim);
@@ -1275,6 +1275,7 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                 }
               }
             }
+            *this[t][h * num_pools_f + f] = max_value;
             tmp += 2
           }
         }
@@ -1304,11 +1305,10 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       // dim3 dimGrid, dimBlock;
       // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
       //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks_back(A.data_, data_, Dim(), 
-                          num_pools_t, pool_t_size_, pool_t_step_,
-                          num_pools_h, pool_h_size_, pool_h_step_,
-                          num_pools_f, pool_f_size_, pool_f_step_,
-                          index_max_,)
+      cuda_max_mat_blocks_back(A.data_, data_, index_max_
+                               pool_t_size_, pool_h_size_, pool_f_size_,
+                               pool_t_step_, pool_h_step_, pool_f_step_,
+                                             input_h_dim_, input_f_dim_)
       // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_,
       //                       A.data_, A.Dim(), data_, Dim());
       CU_SAFE_CALL(cudaGetLastError());

From 0d02c03de9a49fa9d28f275b3d46aa5719ea6d37 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Mon, 7 May 2018 13:55:50 -0400
Subject: [PATCH 10/15] cpp grammar error

---
 src/cudamatrix/cu-kernels.cu              |  6 +++---
 src/cudamatrix/cu-kernels.h               |  4 ++--
 src/cudamatrix/cu-matrix.cc               | 22 +++++++++++-----------
 src/cudamatrix/cu-matrix.h                |  6 +++---
 src/nnet3/nnet-convolutional-component.cc | 16 +++++++++-------
 src/nnet3/nnet-convolutional-component.h  | 10 ++++++----
 6 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index f043f64b64b..739a1b48d35 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -775,7 +775,7 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks(const Real* src, Real* dst, Real* index_max_,
+static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
                             const int32_cuda pool_t_size_,                            
                             const int32_cuda pool_h_size_,
                             const int32_cuda pool_f_size_,
@@ -829,7 +829,7 @@ static void _max_mat_blocks(const Real* src, Real* dst, Real* index_max_,
 // are exchanged.
 template<typename Real>
 __global__
-static void _max_mat_blocks_trans(const Real* src, Real* dst, Real* index_max_,
+static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_,
                                   const int32_cuda pool_t_size_,                            
                                   const int32_cuda pool_h_size_,
                                   const int32_cuda pool_f_size_,
@@ -874,7 +874,7 @@ static void _max_mat_blocks_trans(const Real* src, Real* dst, Real* index_max_,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks_back(const Real* src, Real* dst, Real* index_max_,
+static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_,
                                  const int32_cuda pool_t_size_,                            
                                  const int32_cuda pool_h_size_,
                                  const int32_cuda pool_f_size_,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 0d8361cb38d..e85360eaf0f 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -178,7 +178,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
                           const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
                           const int32_cuda input_f_dim_,
-                          int A_tran) {
+                          int A_trans) {
   cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_,
                        pool_t_size_, pool_h_size_, pool_f_size_,
                        pool_t_step_, pool_h_step_, pool_f_step_,
@@ -194,7 +194,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
                           const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
                           const int32_cuda input_f_dim_,
-                          int A_tran) {
+                          int A_trans) {
   cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_,
                        pool_t_size_, pool_h_size_, pool_f_size_,
                        pool_t_step_, pool_h_step_, pool_f_step_,
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 9af200a8b5e..06e40603dcb 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1191,7 +1191,6 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
 template<typename Real>
 void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                                       CuVectorBase<Real> &index_max_,
-                                      MatrixTransposeType transA,
                                       const int32 input_t_dim_,
                                       const int32 pool_t_size_,
                                       const int32 pool_t_step_,
@@ -1200,7 +1199,8 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                                       const int32 pool_h_step_,
                                       const int32 input_f_dim_,
                                       const int32 pool_f_size_,
-                                      const int32 pool_f_step_) {
+                                      const int32 pool_f_step_,
+                                      MatrixTransposeType transA) {
   if (num_rows_ == 0 || num_cols_ == 0) return;
 
   int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
@@ -1266,17 +1266,17 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                 for (int32 z = 0; z < pool_f_size_; z++) {              
                   int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z;
                   if (A[cur_x][cur_y] > max_value) {
-                    max_x = cur_x
+                    max_x = cur_x;
                     max_y = cur_y;
                     max_value = A[cur_x][cur_y];
-                    index_max_[tmp] = cur_x
-                    index_max_[tmp+1] = cur_y
+                    index_max_[tmp] = cur_x;
+                    index_max_[tmp+1] = cur_y;
                   }
                 }
               }
             }
             *this[t][h * num_pools_f + f] = max_value;
-            tmp += 2
+            tmp += 2;
           }
         }
       }
@@ -1305,10 +1305,10 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       // dim3 dimGrid, dimBlock;
       // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
       //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks_back(A.data_, data_, index_max_
+      cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_,
                                pool_t_size_, pool_h_size_, pool_f_size_,
                                pool_t_step_, pool_h_step_, pool_f_step_,
-                                             input_h_dim_, input_f_dim_)
+                                             input_h_dim_, input_f_dim_);
       // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_,
       //                       A.data_, A.Dim(), data_, Dim());
       CU_SAFE_CALL(cudaGetLastError());
@@ -1317,11 +1317,11 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
 #endif
     {
     // maxpooling backward propagation without cuda
-      *this.SetZero()
+      this->SetZero();
       for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) {
-        *this[index_max_[x]][index_max_[x+1]] = 1
+        *this[index_max_[x]][index_max_[x+1]] = 1;
       }
-      *this.MulElements(A);
+      this->MulElements(A);
     }
   }
 }
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 67e2c9ad6ca..57761457226 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -527,11 +527,11 @@ class CuMatrixBase {
   ///       function set all the values in &out_deriv whose index is not in 
   ///       vector(not corresponding to maximum value in each pool of &in_value) 
   ///       as zero, and keeps those correponding to maximum value as the *in_deriv.
-  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_,
-                    MatrixTransposeType trans = kNoTrans,
+  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_,                    
                     const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
                     const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_,
-                    const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_);
+                    const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_,
+                    MatrixTransposeType trans = kNoTrans);
 
   /// (for each column c of *this), c = alpha * col + beta * c
   void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc
index 9822b34a2a5..7bcca4bd3dc 100644
--- a/src/nnet3/nnet-convolutional-component.cc
+++ b/src/nnet3/nnet-convolutional-component.cc
@@ -679,12 +679,12 @@ MaxPoolingOverBlock::MaxPoolingOverBlock(
     pool_f_step_(other.pool_f_step_) { }
 
 // aquire input dim
-int32 MaxpoolingComponent::InputDim() const {
+int32 MaxPoolingOverBlock::InputDim() const {
   return input_t_dim_ * input_h_dim_ * input_f_dim_;
 }
 
 // aquire output dim
-int32 MaxpoolingComponent::OutputDim() const {
+int32 MaxPoolingOverBlock::OutputDim() const {
   int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
   int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
   int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
@@ -692,7 +692,7 @@ int32 MaxpoolingComponent::OutputDim() const {
 }
 
 // check the component parameters
-void MaxpoolingComponent::Check() const {
+void MaxPoolingOverBlock::Check() const {
   // sanity check of the max pooling parameters
   KALDI_ASSERT(input_t_dim_ > 0);
   KALDI_ASSERT(input_h_dim_ > 0);
@@ -804,10 +804,11 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
                                      const CuMatrixBase<BaseFloat> &in_value,
                                      CuMatrixBase<BaseFloat> *out_value) const {
 
-  out_value->MaxMatBlocks(in_value, index_max_, kNoTrans,
+  out_value->MaxMatBlocks(in_value, index_max_,
                     input_t_dim_, pool_t_size_, pool_t_step_,
                     input_h_dim_, pool_h_size_, pool_h_step_,
-                    input_f_dim_, pool_f_size_, pool_f_step_);
+                    input_f_dim_, pool_f_size_, pool_f_step_,
+                    kNoTrans);
   return NULL;
 }
 
@@ -822,10 +823,11 @@ void MaxPoolingOverBlock::Backprop(
     CuMatrixBase<BaseFloat> *in_deriv) const {
 
   if (in_deriv) {
-    in_derv->MaxMatBlocks(out_deriv, index_max_, kNoTrans,
+    in_deriv->MaxMatBlocks(out_deriv, index_max_,
                           input_t_dim_, pool_t_size_, pool_t_step_,
                           input_h_dim_, pool_h_size_, pool_h_step_,
-                          input_f_dim_, pool_f_size_, pool_f_step_);
+                          input_f_dim_, pool_f_size_, pool_f_step_,
+                          kNoTrans);
 
   }
 }
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index 5e454be9cea..6544254d8b1 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -513,8 +513,9 @@ class MaxPoolingOverBlock: public Component {
                         Component *to_update,
                         CuMatrixBase<BaseFloat> *in_deriv) const;
   virtual void Check() const;
-
- private:
+  CuVectorBase<BaseFloat> index_max_; // the index of maximum value
+  
+ protected:
   int32 input_t_dim_;   // size of the input along t-axis
   // (e.g. number of time steps)
   int32 input_h_dim_;   // size of input along h-axis
@@ -533,9 +534,10 @@ class MaxPoolingOverBlock: public Component {
   int32 pool_f_step_;   // the number of steps taken along f-axis of input
   // before computing the next pool
 
-  CuVectorBase<Real> index_max_; // the index of maximum value
+  // CuVectorBase<BaseFloat> index_max_; // the index of maximum value
+  // std::vector<int32> index_max_;
 
-  MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
+  // MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
 };
 
 } // namespace nnet3

From 8d554310373361b4f8cebdc6ccb67300de8bcf6d Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Mon, 7 May 2018 15:01:25 -0400
Subject: [PATCH 11/15] change [] to () as index of CuVectorBase & CuMatrixBase

---
 src/cudamatrix/cu-matrix.cc | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 06e40603dcb..771cb4ab6c4 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1251,12 +1251,14 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
     {
     // maxpooling without cuda
       int32 tmp = 0;
+      // Real *data = this->data_;
+      // std::vector<int32> idx_tmp(index_max_.dim_);
       for (int32 t = 0; t < num_pools_t; t++) {
         for (int32 h = 0; h < num_pools_t; h++) {
           for (int32 f = 0; f < num_pools_f; f++) {
             // initialize the maximum value as the first element in the pool
             int32 max_x = 0; int32 max_y = 0; 
-            int32 max_value = A[t * pool_t_step_][h * pool_h_step_ * input_f_dim_ + f * pool_f_step_];
+            int32 max_value = A(t * pool_t_step_, h * pool_h_step_ * input_f_dim_ + f * pool_f_step_);
             
             // find the maximm value in the pool
             for (int32 x = 0; x < pool_t_size_; x++) {
@@ -1265,21 +1267,25 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
               for (int32 y = 0; y < pool_h_size_; y++) { 
                 for (int32 z = 0; z < pool_f_size_; z++) {              
                   int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z;
-                  if (A[cur_x][cur_y] > max_value) {
+                  if (A(cur_x, cur_y) > max_value) {
                     max_x = cur_x;
                     max_y = cur_y;
-                    max_value = A[cur_x][cur_y];
-                    index_max_[tmp] = cur_x;
-                    index_max_[tmp+1] = cur_y;
+                    max_value = A(cur_x, cur_y);
+                    index_max_(tmp) = cur_x;
+                    index_max_(tmp+1) = cur_y;
                   }
                 }
               }
             }
-            *this[t][h * num_pools_f + f] = max_value;
+            (*this)(t, h * num_pools_f + f) = max_value;
             tmp += 2;
           }
         }
       }
+      // CuArray<int32> cu_cols(idx_tmp);
+      // // index_max_->CopyCols(index_max_, cu_cols);
+      // // CuVectorBase<Real> ttmp(idx_tmp);
+      // index_max_.CopyFromVec(cu_cols);
     }
   } else {
 
@@ -1305,12 +1311,12 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       // dim3 dimGrid, dimBlock;
       // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
       //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_,
-                               pool_t_size_, pool_h_size_, pool_f_size_,
-                               pool_t_step_, pool_h_step_, pool_f_step_,
-                                             input_h_dim_, input_f_dim_);
-      // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_,
-      //                       A.data_, A.Dim(), data_, Dim());
+
+      // cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_,
+      //                          pool_t_size_, pool_h_size_, pool_f_size_,
+      //                          pool_t_step_, pool_h_step_, pool_f_step_,
+      //                                        input_h_dim_, input_f_dim_);
+
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     } else
@@ -1319,7 +1325,7 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
     // maxpooling backward propagation without cuda
       this->SetZero();
       for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) {
-        *this[index_max_[x]][index_max_[x+1]] = 1;
+        (*this)(index_max_(x),index_max_(x+1)) = 1;
       }
       this->MulElements(A);
     }

From d73c86d4e7a8cc63cce91c84be27570ad355517d Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Tue, 8 May 2018 12:14:30 -0400
Subject: [PATCH 12/15] add some description

---
 src/cudamatrix/cu-kernels.cu             |  18 ++--
 src/cudamatrix/cu-matrix.cc              |   6 +-
 src/cudamatrix/cu-matrix.h               |  53 +++++++++++
 src/nnet3/nnet-convolutional-component.h | 108 +++++++++++++----------
 4 files changed, 125 insertions(+), 60 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 739a1b48d35..194c36d97f1 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -786,7 +786,7 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
                             const int32_cuda input_f_dim_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-  int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+  int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
   int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
   int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
 
@@ -800,17 +800,17 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
 
   for (int32_cuda t = 0; t < pool_t_size_; t++) {
     // the index of row in *src
-    idx_row = i * pool_t_step_ + t;
+    int32_cuda idx_row = i * pool_t_step_ + t;
 
     for (int32_cuda h = 0; h < pool_h_size_; h++) {
       for (int32_cuda f = 0; f < pool_f_size_; f++) {
         // the index of column in *src
-        idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
+        int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
 
         if (src[idx_row][idx_col] > max_value) {
           max_row = idx_row;
           max_col = idx_col;
-          max_value = src[idx_row][idx_col]
+          max_value = src[idx_row][idx_col];
         }
       } 
     }
@@ -840,7 +840,7 @@ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_,
                                   const int32_cuda input_f_dim_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-  int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+  int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
   int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
   int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
 
@@ -849,16 +849,16 @@ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_,
   int32_cuda max_value = src[max_col][max_row];
 
   for (int32_cuda t = 0; t < pool_t_size_; t++) {
-  	idx_row = i * pool_t_step_ + t;
+  	int32_cuda idx_row = i * pool_t_step_ + t;
 
     for (int32_cuda h = 0; h < pool_h_size_; h++) {
       for (int32_cuda f = 0; f < pool_f_size_; f++) {
-        idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
+        int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
 
         if (src[idx_col][idx_row] > max_value) {
           max_row = idx_row;
           max_col = idx_col;
-          max_value = src[idx_col][idx_row]
+          max_value = src[idx_col][idx_row];
         }
       } 
     }
@@ -885,7 +885,7 @@ static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_,
                                  const int32_cuda input_f_dim_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
-  int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z;
+  int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
   int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
   int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
 
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 771cb4ab6c4..97cfce8d7e5 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1238,7 +1238,7 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       // dim3 dimGrid, dimBlock;
       // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
       //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_,
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,// index_max_,
                           pool_t_size_, pool_h_size_, pool_f_size_,
                           pool_t_step_, pool_h_step_, pool_f_step_,
                                         input_h_dim_, input_f_dim_,
@@ -1282,10 +1282,6 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
           }
         }
       }
-      // CuArray<int32> cu_cols(idx_tmp);
-      // // index_max_->CopyCols(index_max_, cu_cols);
-      // // CuVectorBase<Real> ttmp(idx_tmp);
-      // index_max_.CopyFromVec(cu_cols);
     }
   } else {
 
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 57761457226..7c8c5bfd8af 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -527,6 +527,59 @@ class CuMatrixBase {
   ///       function set all the values in &out_deriv whose index is not in 
   ///       vector(not corresponding to maximum value in each pool of &in_value) 
   ///       as zero, and keeps those correponding to maximum value as the *in_deriv.
+  /// Parameters:
+  ///    
+  ///    size of input matrix:
+  ///         input_t_dim_    size of the input along t-axis
+  ///                         (e.g. number of time steps)
+  ///         input_h_dim_    size of input along h-axis
+  ///                         (e.g. number of mel-frequency bins)
+  ///         input_f_dim_    size of input along f-axis
+  ///                         (e.g. number of filters in the ConvolutionComponent)
+  ///  
+  ///    block size:
+  ///         pool_t_size_    size of the pooling window along t-axis
+  ///         pool_h_size_    size of the pooling window along h-axis
+  ///         pool_f_size_    size of the pooling window along f-axis
+  ///                         (So, the dimension of block is:
+  ///                          pool_t_size_ by pool_h_size_ * pool_f_size_)
+  ///    
+  ///    stride size:
+  ///         pool_t_step_    the number of steps taken along t-axis of input
+  ///                         before computing the next pool (e.g. the stride
+  ///                          size along t-axis)
+  ///         pool_h_step_    the number of steps taken along h-axis of input
+  ///                         before computing the next pool (e.g. the stride
+  ///                          size along t-axis)
+  ///         pool_f_step_    the number of steps taken along f-axis of input
+  ///                         before computing the next pool (e.g. the stride
+  ///                          size along t-axis)
+
+  ///         index_max_      a vector that store the index of the maximum 
+  ///                         value as (r, c), used in back-propagation. The 
+  ///                         size of this vector is 2 * num_pools_t * 
+  ///                         num_pools_h * num_pools_f
+  ///
+  ///   So there are totally num_pools_t * num_pools_h * num_pools_f blocks,
+  ///   where:
+  ///       num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
+  ///       // the number of blocks in t dimension
+  ///       num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; 
+  ///       // the number of blocks in h dimension
+  ///       num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; 
+  ///       // the number of blocks in f dimension
+  ///    If we have index idx_t, idx_h, idx_f in each axis, then we can find 
+  ///    the block with:
+  ///      row index: 
+  ///        [start_t, start_t + pool_t_size_];
+  ///      column index: combination of sets:
+  ///        [start_col(0), start_col(0) + pool_f_size_],
+  ///        [start_col(1), start_col(1) + pool_f_size_],
+  ///        ...,
+  ///        [start_col(pool_h_size_), start_col(pool_h_size_) + pool_f_size_]
+  ///    where:
+  ///      start_row    = idx_t * pool_t_step_
+  ///      start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_
   void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_,                    
                     const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
                     const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_,
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index 6544254d8b1..ce6bfd5096f 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -374,11 +374,10 @@ class TimeHeightConvolutionComponent: public UpdatableComponent {
   this component should be compatible with TimeHeightConvolutionComponent
 
   MaxPoolingOverBlock :
-  MaxPoolingOverBlock component was firstly used in ConvNet for selecting an
-  representative activation in an area. It inspired Maxout nonlinearity.
-  Each output element of this component is the maximum of a block of
-  input elements where the block has a 2.5D dimension (pool_t_size_,
-  pool_h_size_ * pool_f_size_).
+  MaxPoolingOverBlock component was firstly used in ConvNet. It inspired 
+  Maxout nonlinearity. Each output element of this component is the 
+  maximum of a block of input elements where the block has a 
+  dimension (pool_t_size_, pool_h_size_ * pool_f_size_).
   Blocks could overlap if the shift value on any axis is smaller
   than its corresponding pool size (e.g. pool_t_step_ < pool_t_size_).
   If the shift values are euqal to their pool size, there is no
@@ -387,15 +386,66 @@ class TimeHeightConvolutionComponent: public UpdatableComponent {
  
   This component is designed to be used after a ConvolutionComponent
   so that the input matrix is propagated from a 2d-convolutional layer.
-  This component implements 2.5d-maxpooling which performs
+  This component implements maxpooling which performs
   max pooling along the three axes.
  
-  Input : A 2.5D matrix with dimensions:
+  Input : A matrix with dimensions:
          t: (e.g. time)
          h: (e.g. height, mel-frequency)
          f: (e.g. channels like number of filters in the ConvolutionComponent)
- 
-         The reason why we call the matrix 2.5D is because we compress the 3D block 
+
+  Parameters:
+
+            input_t_dim_    size of the input along t-axis
+                            (e.g. number of time steps)
+            input_h_dim_    size of input along h-axis
+                            (e.g. number of mel-frequency bins)
+            input_f_dim_    size of input along f-axis
+                            (e.g. number of filters in the ConvolutionComponent)
+
+            pool_t_size_    size of the pooling window along t-axis
+            pool_h_size_    size of the pooling window along h-axis
+            pool_f_size_    size of the pooling window along f-axis
+
+            pool_t_step_    the number of steps taken along t-axis of input
+                            before computing the next pool (e.g. the stride
+                             size along t-axis)
+            pool_h_step_    the number of steps taken along h-axis of input
+                            before computing the next pool (e.g. the stride
+                             size along t-axis)
+            pool_f_step_    the number of steps taken along f-axis of input
+                            before computing the next pool (e.g. the stride
+                             size along t-axis)
+
+            index_max_      a vector that store the index of the maximum 
+                            value as (r, c), used in back-propagation. The 
+                            size of this vector is 2 * num_pools_t * 
+                            num_pools_h * num_pools_f
+
+         So there are totally num_pools_t * num_pools_h * num_pools_f blocks,
+         where:
+           num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
+           // the number of blocks in t dimension
+           num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; 
+           // the number of blocks in h dimension
+           num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; 
+           // the number of blocks in f dimension
+
+         If we have index idx_t, idx_h, idx_f in each axis, then we can find 
+         the block with:
+           row index: 
+             [start_t, start_t + pool_t_size_];
+           column index: combination of sets:
+             [start_col(0), start_col(0) + pool_f_size_],
+             [start_col(1), start_col(1) + pool_f_size_],
+             ...,
+             [start_col(pool_h_size_), start_col(pool_h_size_) + pool_f_size_]
+         where:
+           start_row    = idx_t * pool_t_step_
+           start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_
+
+  Example:
+         We store the 3D matrix 
          into a 2D matrix by concatenating each 2D matrix at different channel like:
  
                      h = 0                   h = 1
@@ -442,45 +492,11 @@ class TimeHeightConvolutionComponent: public UpdatableComponent {
       stride of height(poo_h_step), we arrange each row of output as:
       (all filters for height 0)(all filters for height 1)...
 
-  Parameters:
-
-            input_t_dim_    size of the input along t-axis
-                            (e.g. number of time steps)
-            input_h_dim_    size of input along h-axis
-                            (e.g. number of mel-frequency bins)
-            input_f_dim_    size of input along f-axis
-                            (e.g. number of filters in the ConvolutionComponent)
-
-            pool_t_size_    size of the pooling window along t-axis
-            pool_h_size_    size of the pooling window along h-axis
-            pool_f_size_    size of the pooling window along f-axis
-
-            pool_t_step_    the number of steps taken along t-axis of input
-                            before computing the next pool (e.g. the stride
-                             size along t-axis)
-            pool_h_step_    the number of steps taken along h-axis of input
-                            before computing the next pool (e.g. the stride
-                             size along t-axis)
-            pool_f_step_    the number of steps taken along f-axis of input
-                            before computing the next pool (e.g. the stride
-                             size along t-axis)
-
-            index_max_      a vector that store the index of the maximum 
-                            value as (r, c), used in back-propagation. The 
-                            size of this vector is 2 * num_pools_t * 
-                            num_pools_h * num_pools_f
-
- 
  
-  Output : The output is also a 2.5D tensor with dimension (num_block_t by 
-           num_block_h * num_block_f) where:
  
-           num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
-           // the number of blocks in t dimension
-           num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; 
-           // the number of blocks in h dimension
-           num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; 
-           // the number of blocks in f dimension
+  Output : The output is also a 2D matrix with dimension (num_block_t by 
+           num_block_h * num_block_f), with each element corresponding to
+           a block.
  
  
  */

From 7b0d46a4f0b69b301d11d8775aa8c3ba250398e8 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Tue, 8 May 2018 12:51:48 -0400
Subject: [PATCH 13/15] delete commet not needed

---
 src/cudamatrix/cu-kernels.cu             |  2 +-
 src/cudamatrix/cu-matrix.cc              | 27 ++++++++----------------
 src/nnet3/nnet-convolutional-component.h |  7 +++---
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 194c36d97f1..8005482122b 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -4105,7 +4105,7 @@ void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl,
                           const int32_cuda input_f_dim_,
                           int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_,
                         pool_t_size_, pool_h_size_, pool_f_size_,
                         pool_t_step_, pool_h_step_, pool_f_step_,
                                       input_h_dim_, input_f_dim_);
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 97cfce8d7e5..b9ef308bdd8 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1207,7 +1207,6 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
   int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
   int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
 
-  // Not sure whether this needed?
   KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0 && 
                (input_h_dim_ - pool_h_size_) % pool_h_step_ == 0 &&
                (input_f_dim_ - pool_f_size_) % pool_f_step_ == 0);
@@ -1216,7 +1215,6 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) {
     // This is the "forward-propagation" version of MaxMatBlocks.
     // It supports both regular and transposed operation.
-
     if (transA == kNoTrans) {
       KALDI_ASSERT(A.NumRows() == input_t_dim_ &&
                    A.NumCols() == input_h_dim_ * input_f_dim_ &&
@@ -1235,10 +1233,8 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       CuTimer tim;
       dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
       dim3 dimGrid(1);
-      // dim3 dimGrid, dimBlock;
-      // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
-      //                                       &dimGrid, &dimBlock);
-      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,// index_max_,
+
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_.data_,
                           pool_t_size_, pool_h_size_, pool_f_size_,
                           pool_t_step_, pool_h_step_, pool_f_step_,
                                         input_h_dim_, input_f_dim_,
@@ -1251,8 +1247,6 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
     {
     // maxpooling without cuda
       int32 tmp = 0;
-      // Real *data = this->data_;
-      // std::vector<int32> idx_tmp(index_max_.dim_);
       for (int32 t = 0; t < num_pools_t; t++) {
         for (int32 h = 0; h < num_pools_t; h++) {
           for (int32 f = 0; f < num_pools_f; f++) {
@@ -1271,8 +1265,8 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                     max_x = cur_x;
                     max_y = cur_y;
                     max_value = A(cur_x, cur_y);
-                    index_max_(tmp) = cur_x;
-                    index_max_(tmp+1) = cur_y;
+                    index_max_(tmp) = max_x;
+                    index_max_(tmp+1) = max_y;
                   }
                 }
               }
@@ -1304,14 +1298,11 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       CuTimer tim;
       dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
       dim3 dimGrid(1);
-      // dim3 dimGrid, dimBlock;
-      // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
-      //                                       &dimGrid, &dimBlock);
-
-      // cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_,
-      //                          pool_t_size_, pool_h_size_, pool_f_size_,
-      //                          pool_t_step_, pool_h_step_, pool_f_step_,
-      //                                        input_h_dim_, input_f_dim_);
+
+      cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_.data_,
+                               pool_t_size_, pool_h_size_, pool_f_size_,
+                               pool_t_step_, pool_h_step_, pool_f_step_,
+                                             input_h_dim_, input_f_dim_);
 
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index ce6bfd5096f..6cd421eaf5a 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -529,7 +529,7 @@ class MaxPoolingOverBlock: public Component {
                         Component *to_update,
                         CuMatrixBase<BaseFloat> *in_deriv) const;
   virtual void Check() const;
-  CuVectorBase<BaseFloat> index_max_; // the index of maximum value
+  
   
  protected:
   int32 input_t_dim_;   // size of the input along t-axis
@@ -550,10 +550,9 @@ class MaxPoolingOverBlock: public Component {
   int32 pool_f_step_;   // the number of steps taken along f-axis of input
   // before computing the next pool
 
-  // CuVectorBase<BaseFloat> index_max_; // the index of maximum value
-  // std::vector<int32> index_max_;
+  CuVectorBase<BaseFloat> index_max_; // the index of maximum value
 
-  // MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
+  MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow.
 };
 
 } // namespace nnet3

From 2e99da3371c35a8df006659c1d0ff2789cf812e1 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Tue, 8 May 2018 17:52:06 -0400
Subject: [PATCH 14/15] add a within-block stride

---
 src/cudamatrix/cu-kernels-ansi.h |  50 +++++-----
 src/cudamatrix/cu-kernels.cu     | 165 +++++++++++++++++--------------
 src/cudamatrix/cu-kernels.h      |  78 ++++++++-------
 src/cudamatrix/cu-matrix.cc      |  15 +--
 src/cudamatrix/cu-matrix.h       |   5 +-
 5 files changed, 171 insertions(+), 142 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 386b43ae16d..7ee981f6544 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -112,46 +112,52 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim);
 void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, 
                           const double *src, double *dst, double *index_max_,
-                          const int32_cuda pool_t_size_,                            
-                          const int32_cuda pool_h_size_,
-                          const int32_cuda pool_f_size_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
                           const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
                           int A_tran);
 void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, 
                           const float *src, float *dst, float *index_max_,
-                          const int32_cuda pool_t_size_,                            
-                          const int32_cuda pool_h_size_,
-                          const int32_cuda pool_f_size_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
                           const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
                           int A_tran);
 void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
                                const double *src, double *dst, double *index_max_,
-                               const int32_cuda pool_t_size_,                            
-                               const int32_cuda pool_h_size_,
-                               const int32_cuda pool_f_size_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
                                const int32_cuda pool_t_step_,
-                               const int32_cuda pool_h_step_,
-                               const int32_cuda pool_f_step_,
                                const int32_cuda input_h_dim_,
-                               const int32_cuda input_f_dim_);
-void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl,
-                               const float *src, float *dst, float *index_max_,
-                               const int32_cuda pool_t_size_,                            
                                const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
                                const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                               const float *src, float *dst, float *index_max_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
                                const int32_cuda pool_t_step_,
-                               const int32_cuda pool_h_step_,
-                               const int32_cuda pool_f_step_,
                                const int32_cuda input_h_dim_,
-                               const int32_cuda input_f_dim_);
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_);
 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                             MatrixDim mat_dim, const double *mat2,
                             int mat2_row_stride, int mat2_col_stride,
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8005482122b..34f5c7652ba 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -775,15 +775,17 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
 
 template<typename Real>
 __global__
-static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
-                            const int32_cuda pool_t_size_,                            
-                            const int32_cuda pool_h_size_,
-                            const int32_cuda pool_f_size_,
+static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, 
+                            const int32_cuda stride,
+                            const int32_cuda input_t_dim_,
+                            const int32_cuda pool_t_size_,
                             const int32_cuda pool_t_step_,
-                            const int32_cuda pool_h_step_,
-                            const int32_cuda pool_f_step_,
                             const int32_cuda input_h_dim_,
-                            const int32_cuda input_f_dim_) {
+                            const int32_cuda pool_h_size_,
+                            const int32_cuda pool_h_step_,
+                            const int32_cuda input_f_dim_,
+                            const int32_cuda pool_f_size_,
+                            const int32_cuda pool_f_step_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
@@ -793,12 +795,12 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
   // initialize the temporary maximum value and its index in each pool
   int32_cuda max_row = i * pool_t_step_; 
   int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_;
-  int32_cuda max_value = src[max_row][max_col];
+  int32_cuda max_value = src[max_row * input_h_dim_ * input_f_dim_ + max_col];
 
   // loop over all the elements in each pool to find the maximum one,
   // and record its index.
 
-  for (int32_cuda t = 0; t < pool_t_size_; t++) {
+  for (int32_cuda t = 0; t < pool_t_size_; t += stride) {
     // the index of row in *src
     int32_cuda idx_row = i * pool_t_step_ + t;
 
@@ -807,16 +809,16 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
         // the index of column in *src
         int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
 
-        if (src[idx_row][idx_col] > max_value) {
+        if (src[idx_row * input_h_dim_ * input_f_dim_ + idx_col] > max_value) {
           max_row = idx_row;
           max_col = idx_col;
-          max_value = src[idx_row][idx_col];
+          max_value = src[max_row * input_h_dim_ * input_f_dim_ + max_col];
         }
       } 
     }
   }
 
-  dst[i][j * num_pools_f + k] = max_value;
+  dst[i * num_pools_h * num_pools_f + j * num_pools_f + k] = max_value;
 
   // the index of indexes stored in vector 'index_max_'.
   int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k;
@@ -830,41 +832,44 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
 template<typename Real>
 __global__
 static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_,
-                                  const int32_cuda pool_t_size_,                            
-                                  const int32_cuda pool_h_size_,
-                                  const int32_cuda pool_f_size_,
+                                  const int32_cuda stride,
+                                  const int32_cuda input_t_dim_,
+                                  const int32_cuda pool_t_size_,
                                   const int32_cuda pool_t_step_,
-                                  const int32_cuda pool_h_step_,
-                                  const int32_cuda pool_f_step_,
                                   const int32_cuda input_h_dim_,
-                                  const int32_cuda input_f_dim_) {
+                                  const int32_cuda pool_h_size_,
+                                  const int32_cuda pool_h_step_,
+                                  const int32_cuda input_f_dim_,
+                                  const int32_cuda pool_f_size_,
+                                  const int32_cuda pool_f_step_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
+  int32_cuda num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; 
   int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
   int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
 
   int32_cuda max_row = i * pool_t_step_; 
   int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_;
-  int32_cuda max_value = src[max_col][max_row];
+  int32_cuda max_value = src[max_col * input_t_dim_ + max_row];
 
-  for (int32_cuda t = 0; t < pool_t_size_; t++) {
+  for (int32_cuda t = 0; t < pool_t_size_; t += stride) {
   	int32_cuda idx_row = i * pool_t_step_ + t;
 
     for (int32_cuda h = 0; h < pool_h_size_; h++) {
       for (int32_cuda f = 0; f < pool_f_size_; f++) {
         int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f;
 
-        if (src[idx_col][idx_row] > max_value) {
+        if (src[idx_col * input_t_dim_ + idx_row] > max_value) {
           max_row = idx_row;
           max_col = idx_col;
-          max_value = src[idx_col][idx_row];
+          max_value = src[max_col * input_t_dim_ + max_row];
         }
       } 
     }
   }
 
-  dst[j * num_pools_f + k][i] = max_value;
+  dst[(j * num_pools_f + k) * num_pools_t + i] = max_value;
 
   int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k;
   index_max_[idx_in_idxmax] = max_row;
@@ -875,14 +880,15 @@ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_,
 template<typename Real>
 __global__
 static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_,
-                                 const int32_cuda pool_t_size_,                            
-                                 const int32_cuda pool_h_size_,
-                                 const int32_cuda pool_f_size_,
+                                 const int32_cuda input_t_dim_,
+                                 const int32_cuda pool_t_size_,
                                  const int32_cuda pool_t_step_,
-                                 const int32_cuda pool_h_step_,
-                                 const int32_cuda pool_f_step_,
                                  const int32_cuda input_h_dim_,
-                                 const int32_cuda input_f_dim_) {
+                                 const int32_cuda pool_h_size_,
+                                 const int32_cuda pool_h_step_,
+                                 const int32_cuda input_f_dim_,
+                                 const int32_cuda pool_f_size_,
+                                 const int32_cuda pool_f_step_) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
@@ -899,10 +905,11 @@ static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_,
 
         if (idx_row == index_max_[idx_in_idxmax] && 
             idx_col == index_max_[idx_in_idxmax + 1] ||
-            dst[idx_row][idx_col] != 0) {
-          dst[idx_row][idx_col] = src[i][j * num_pools_f + k];
+            dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] != 0) {
+          dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] = 
+          src[i * num_pools_h * num_pools_f + j * num_pools_f + k];
         } else {
-          dst[idx_row][idx_col] = 0;
+          dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] = 0;
         }
       }
     }
@@ -4095,42 +4102,45 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
 
 void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, 
                           const float *src, float *dst, float *index_max_,
-                          const int32_cuda pool_t_size_,                            
-                          const int32_cuda pool_h_size_,
-                          const int32_cuda pool_f_size_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
                           const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
                           int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_,
-                        pool_t_size_, pool_h_size_, pool_f_size_,
-                        pool_t_step_, pool_h_step_, pool_f_step_,
-                                      input_h_dim_, input_f_dim_);
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_);
   } else {
-    _max_mat_blocks<<<Gr,Bl>>>(src, dst, index_max_,
-                        pool_t_size_, pool_h_size_, pool_f_size_,
-                        pool_t_step_, pool_h_step_, pool_f_step_,
-                                      input_h_dim_, input_f_dim_);
+    _max_mat_blocks<<<Gr,Bl>>>(src, dst, index_max_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_);
   }
 }
 
 void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
                                const float *src, float *dst, float *index_max_,
-                               const int32_cuda pool_t_size_,                            
-                               const int32_cuda pool_h_size_,
-                               const int32_cuda pool_f_size_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
                                const int32_cuda pool_t_step_,
-                               const int32_cuda pool_h_step_,
-                               const int32_cuda pool_f_step_,
                                const int32_cuda input_h_dim_,
-                               const int32_cuda input_f_dim_) {
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_) {
   _max_mat_blocks_back<<<Gr,Bl>>>(src, dst, index_max_,
-                                  pool_t_size_, pool_h_size_, pool_f_size_,
-                                  pool_t_step_, pool_h_step_, pool_f_step_,
-                                                input_h_dim_, input_f_dim_);
+	                            input_t_dim_, pool_t_size_, pool_t_step_,
+	                            input_h_dim_, pool_h_size_, pool_h_step_,
+	                            input_f_dim_, pool_f_size_, pool_f_step_);
 }
 
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
@@ -4838,42 +4848,45 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
 
 void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, 
                           const double *src, double *dst, double *index_max_,
-                          const int32_cuda pool_t_size_,                            
-                          const int32_cuda pool_h_size_,
-                          const int32_cuda pool_f_size_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
                           const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
                           int A_trans) {
   if (A_trans) {
-    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_,
-                              pool_t_size_, pool_h_size_, pool_f_size_,
-                              pool_t_step_, pool_h_step_, pool_f_step_,
-                                            input_h_dim_, input_f_dim_);
+    _max_mat_blocks_trans<<<Gr,Bl>>>(src, dst, index_max_, stride,
+			                         input_t_dim_, pool_t_size_, pool_t_step_,
+			                         input_h_dim_, pool_h_size_, pool_h_step_,
+			                         input_f_dim_, pool_f_size_, pool_f_step_);
   } else {
-    _max_mat_blocks<<<Gr,Bl>>>(src, dst, index_max_,
-                              pool_t_size_, pool_h_size_, pool_f_size_,
-                              pool_t_step_, pool_h_step_, pool_f_step_,
-                                            input_h_dim_, input_f_dim_);
+    _max_mat_blocks<<<Gr,Bl>>>(src, dst, index_max_, stride,
+	                           input_t_dim_, pool_t_size_, pool_t_step_,
+	                           input_h_dim_, pool_h_size_, pool_h_step_,
+	                           input_f_dim_, pool_f_size_, pool_f_step_);
   }
 }
 
 void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
                                const double *src, double *dst, double *index_max_,
-                               const int32_cuda pool_t_size_,                            
-                               const int32_cuda pool_h_size_,
-                               const int32_cuda pool_f_size_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
                                const int32_cuda pool_t_step_,
-                               const int32_cuda pool_h_step_,
-                               const int32_cuda pool_f_step_,
                                const int32_cuda input_h_dim_,
-                               const int32_cuda input_f_dim_) {
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_) {
   _max_mat_blocks_back<<<Gr,Bl>>>(src, dst, index_max_,
-                                  pool_t_size_, pool_h_size_, pool_f_size_,
-                                  pool_t_step_, pool_h_step_, pool_f_step_,
-                                                input_h_dim_, input_f_dim_);
+		                          input_t_dim_, pool_t_size_, pool_t_step_,
+		                          input_h_dim_, pool_h_size_, pool_h_step_,
+		                          input_f_dim_, pool_f_size_, pool_f_step_);
 }
 
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index e85360eaf0f..0712179bca8 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -170,65 +170,71 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
 }
 inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, 
                           const double *src, double *dst, double *index_max_,
-                          const int32_cuda pool_t_size_,                            
-                          const int32_cuda pool_h_size_,
-                          const int32_cuda pool_f_size_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
                           const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
                           int A_trans) {
-  cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_,
-                       pool_t_size_, pool_h_size_, pool_f_size_,
-                       pool_t_step_, pool_h_step_, pool_f_step_,
-                                     input_h_dim_, input_f_dim_, A_trans);
+  cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_, A_trans);
 }
 inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, 
                           const float *src, float *dst, float *index_max_,
-                          const int32_cuda pool_t_size_,                            
-                          const int32_cuda pool_h_size_,
-                          const int32_cuda pool_f_size_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
                           const int32_cuda pool_t_step_,
-                          const int32_cuda pool_h_step_,
-                          const int32_cuda pool_f_step_,
                           const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
                           const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
                           int A_trans) {
-  cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_,
-                       pool_t_size_, pool_h_size_, pool_f_size_,
-                       pool_t_step_, pool_h_step_, pool_f_step_,
-                                     input_h_dim_, input_f_dim_, A_trans);
+  cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_, A_trans);
 }
 inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, 
                                const double *src, double *dst, double *index_max_,
-                               const int32_cuda pool_t_size_,                            
-                               const int32_cuda pool_h_size_,
-                               const int32_cuda pool_f_size_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
                                const int32_cuda pool_t_step_,
-                               const int32_cuda pool_h_step_,
-                               const int32_cuda pool_f_step_,
                                const int32_cuda input_h_dim_,
-                               const int32_cuda input_f_dim_) {
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_) {
   cudaD_max_mat_blocks_back(Gr, Bl, src, dst, index_max_,
-                            pool_t_size_, pool_h_size_, pool_f_size_,
-                            pool_t_step_, pool_h_step_, pool_f_step_,
-                                          input_h_dim_, input_f_dim_);
+                            input_t_dim_, pool_t_size_, pool_t_step_,
+                            input_h_dim_, pool_h_size_, pool_h_step_,
+                            input_f_dim_, pool_f_size_, pool_f_step_);
 }
 inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
                                const float *src, float *dst, float *index_max_,
-                               const int32_cuda pool_t_size_,                            
-                               const int32_cuda pool_h_size_,
-                               const int32_cuda pool_f_size_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
                                const int32_cuda pool_t_step_,
-                               const int32_cuda pool_h_step_,
-                               const int32_cuda pool_f_step_,
                                const int32_cuda input_h_dim_,
-                               const int32_cuda input_f_dim_) {
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_) {
   cudaF_max_mat_blocks_back(Gr, Bl, src, dst, index_max_,
-                            pool_t_size_, pool_h_size_, pool_f_size_,
-                            pool_t_step_, pool_h_step_, pool_f_step_,
-                                          input_h_dim_, input_f_dim_);
+                            input_t_dim_, pool_t_size_, pool_t_step_,
+                            input_h_dim_, pool_h_size_, pool_h_step_,
+                            input_f_dim_, pool_f_size_, pool_f_step_);
 }
 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                                   MatrixDim mat_dim, const double *mat2,
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index b9ef308bdd8..969a4629391 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1191,6 +1191,7 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
 template<typename Real>
 void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
                                       CuVectorBase<Real> &index_max_,
+                                      const int32 stride,
                                       const int32 input_t_dim_,
                                       const int32 pool_t_size_,
                                       const int32 pool_t_step_,
@@ -1234,10 +1235,10 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
       dim3 dimGrid(1);
 
-      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_.data_,
-                          pool_t_size_, pool_h_size_, pool_f_size_,
-                          pool_t_step_, pool_h_step_, pool_f_step_,
-                                        input_h_dim_, input_f_dim_,
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_.data_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_,
                           (transA == kTrans ? 1 : 0));
       CU_SAFE_CALL(cudaGetLastError());
 
@@ -1300,9 +1301,9 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
       dim3 dimGrid(1);
 
       cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_.data_,
-                               pool_t_size_, pool_h_size_, pool_f_size_,
-                               pool_t_step_, pool_h_step_, pool_f_step_,
-                                             input_h_dim_, input_f_dim_);
+                               input_t_dim_, pool_t_size_, pool_t_step_,
+                               input_h_dim_, pool_h_size_, pool_h_step_,
+                               input_f_dim_, pool_f_size_, pool_f_step_);
 
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 7c8c5bfd8af..f64838b3871 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -554,6 +554,9 @@ class CuMatrixBase {
   ///         pool_f_step_    the number of steps taken along f-axis of input
   ///                         before computing the next pool (e.g. the stride
   ///                          size along t-axis)
+  ///         stride          the time stride size within blocks. So we get 
+  ///                         one row of maxpooling candidate every stride rows
+  ///                         in the input matrix.
 
   ///         index_max_      a vector that store the index of the maximum 
   ///                         value as (r, c), used in back-propagation. The 
@@ -580,7 +583,7 @@ class CuMatrixBase {
   ///    where:
   ///      start_row    = idx_t * pool_t_step_
   ///      start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_
-  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_,                    
+  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_, const int32 stride,                    
                     const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
                     const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_,
                     const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_,

From 89ac55b0997b274eb353554ac861f877ccc76874 Mon Sep 17 00:00:00 2001
From: Qian Kun <qkun@b02.clsp.jhu.edu>
Date: Thu, 17 May 2018 07:19:38 -0400
Subject: [PATCH 15/15] testcode for MaxMatBlocks function

---
 src/cudamatrix/cu-matrix-test.cc | 92 ++++++++++++++++++++++++++++++++
 src/cudamatrix/cu-matrix.cc      |  5 +-
 src/cudamatrix/cu-matrix.h       |  4 +-
 3 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index 01030bb8353..60d87ca1ca7 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -2917,6 +2917,97 @@ static void UnitTestCuMatrixEqualElementMask() {
 
 }
 
+template<typename Real>
+static void UnitTestCuMatrixMaxMatBlocks() {
+  for (int32 l = 0; l < 5; l++) {
+    int32 stride = RandInt(1, 5);
+    int32 input_t_dim_ = RandInt(1, 100);
+    int32 pool_t_size_ = RandInt(1, 10);
+    int32 pool_t_step_ = RandInt(1, 10);
+    int32 input_h_dim_ = RandInt(1, 100);
+    int32 pool_h_size_ = RandInt(1, 10);
+    int32 pool_h_step_ = RandInt(1, 10);
+    int32 input_f_dim_ = RandInt(1, 100);
+    int32 pool_f_size_ = RandInt(1, 10);
+    int32 pool_f_step_ = RandInt(1, 10);
+
+
+    // this part is for testing of forward propagation
+    CuMatrix<Real> in_value(input_t_dim_, input_h_dim_ * input_f_dim_);
+    in_value.SetRandn();
+
+    int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+    int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+    int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+    CuMatrix<Real> out_value(num_pools_t, num_pools_h * num_pools_f);
+    out_value.SetRandn();
+
+    CuVector<Real> index_max_(2 * num_pools_t * num_pools_h * num_pools_f);
+    index_max_.SetRandn();
+
+    CuMatrix<Real> out_value_copy(out_value);
+
+    out_value.MaxMatBlocks(in_value, index_max_,
+                    input_t_dim_, pool_t_size_, pool_t_step_,
+                    input_h_dim_, pool_h_size_, pool_h_step_,
+                    input_f_dim_, pool_f_size_, pool_f_step_,
+                    kNoTrans);
+    int32 tmp = 0;
+    for (int32 t = 0; t < num_pools_t; t++) {
+      for (int32 h = 0; h < num_pools_t; h++) {
+        for (int32 f = 0; f < num_pools_f; f++) {
+          // initialize the maximum value as the first element in the pool
+          int32 max_x = 0; int32 max_y = 0;
+          int32 max_value = in_value(t * pool_t_step_, h * pool_h_step_ * input_f_dim_ + f * pool_f_step_);
+
+          // find the maximm value in the pool
+          for (int32 x = 0; x < pool_t_size_; x += stride) {
+            int32 cur_x = t * pool_t_step_ + x;
+
+            for (int32 y = 0; y < pool_h_size_; y++) {
+              for (int32 z = 0; z < pool_f_size_; z++) {
+                int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z;
+                if (in_value(cur_x, cur_y) > max_value) {
+                  max_x = cur_x;
+                  max_y = cur_y;
+                  max_value = in_value(cur_x, cur_y);
+                  index_max_(tmp) = max_x;
+                  index_max_(tmp+1) = max_y;
+                }
+              }
+            }
+          }
+          out_value_copy(t, h * num_pools_f + f) = max_value;
+          tmp += 2;
+        }
+      }
+    }
+
+    AssertEqual(out_value, out_value_copy);
+
+    // this part is for testing backward propagation
+    CuMatrix<Real> in_deriv(input_t_dim_, input_h_dim_ * input_f_dim_);
+    in_deriv.SetZero();
+    CuMatrix<Real> out_deriv(num_pools_t, num_pools_h * num_pools_f);
+    out_deriv.SetRandn();
+    CuMatrix<Real> in_deriv_copy(in_deriv);
+
+    in_deriv.MaxMatBlocks(out_deriv, index_max_,
+                   input_t_dim_, pool_t_size_, pool_t_step_,
+                   input_h_dim_, pool_h_size_, pool_h_step_,
+                   input_f_dim_, pool_f_size_, pool_f_step_,
+                   kNoTrans);
+
+    for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) {
+      int32 row_tmp = (x / 2) / (num_pools_h * num_pools_f);
+      int32 col_tmp = (x / 2) % (num_pools_h * num_pools_f);
+      in_deriv_copy(index_max_(x),index_max_(x+1)) = out_deriv(row_tmp, col_tmp);
+    }
+    AssertEqual(in_deriv, in_deriv_copy);
+  }
+}
+
 template<typename Real> void CudaMatrixUnitTest() {
   UnitTestCuMatrixApplyExpSpecial<Real>();
   UnitTestCuMatrixApplyExpLimited<Real>();
@@ -2987,6 +3078,7 @@ template<typename Real> void CudaMatrixUnitTest() {
   UnitTestCuMatrixAddToElements<Real>();
   UnitTestCuMatrixLookup<Real>();
   UnitTestCuMatrixEqualElementMask<Real>();
+  UnitTestCuMatrixMaxMatBlocks<Real>();
   // test CuVector<Real> methods
   UnitTestCuVectorAddVec<Real>();
   UnitTestCuVectorAddRowSumMat<Real>();
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 969a4629391..f0c10086433 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1313,9 +1313,10 @@ void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
     // maxpooling backward propagation without cuda
       this->SetZero();
       for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) {
-        (*this)(index_max_(x),index_max_(x+1)) = 1;
+        int32 row_tmp = (x / 2) / (num_pools_h * num_pools_f);
+        int32 col_tmp = (x / 2) % (num_pools_h * num_pools_f);
+        (*this)(index_max_(x),index_max_(x+1)) = A(row_tmp, col_tmp);
       }
-      this->MulElements(A);
     }
   }
 }
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index f64838b3871..345d3fd980b 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -583,12 +583,10 @@ class CuMatrixBase {
   ///    where:
   ///      start_row    = idx_t * pool_t_step_
   ///      start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_
-  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_, const int32 stride,                    
-                    const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
+  void MaxMatBlocks(const CuMatrixBase<Real> &A, CuVectorBase<Real> &index_max_, const int32 stride,                    const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_,
                     const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_,
                     const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_,
                     MatrixTransposeType trans = kNoTrans);
-
   /// (for each column c of *this), c = alpha * col + beta * c
   void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
   /// (for each row r of *this), r = alpha * row + beta * r