From 55cae0ff04f2e5d94616d49d7e85008b5cd0d293 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Thu, 12 Apr 2018 19:21:51 -0400 Subject: [PATCH 01/15] maxpooling component over blocks --- src/cudamatrix/cu-kernels.cu | 66 +++++++++++++++++++++++ src/cudamatrix/cu-kernels.h | 14 +++++ src/cudamatrix/cu-matrix.cc | 86 ++++++++++++++++++++++++++++++ src/cudamatrix/cu-matrix.h | 3 ++ src/nnet3/nnet-simple-component.cc | 71 ++++++++++++++++++++++++ src/nnet3/nnet-simple-component.h | 43 +++++++++++++++ 6 files changed, 283 insertions(+) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 934a860a055..ae2781f1b07 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -773,6 +773,46 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src, } } +template +__global__ +static void _max_mat_blocks(Real alpha, const Real* src, + int32_cuda num_row_blocks, + int32_cuda num_col_blocks, Real* dst, MatrixDim d, + int src_stride) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + int32_cuda index_src = i + j * src_stride; + if (i < d.cols && j < d.rows) + for (int32_cuda p = 0; p < num_row_blocks; p++) { + for (int32_cuda q = 0; q < num_col_blocks; q++) { + dst[index] = fmax( + src[index_src + p * src_stride * d.rows + q * d.cols], + dst[index]); + } + } +} + +template +__global__ +static void _max_mat_blocks_trans(Real alpha, const Real* src, + int32_cuda num_row_blocks, + int32_cuda num_col_blocks, Real* dst, + MatrixDim d, int src_stride) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + int32_cuda index_src = j + i * src_stride; + if (i < d.cols && j < d.rows) + for (int32_cuda p = 0; p < num_row_blocks; p++) { + for (int32_cuda q = 0; q < num_col_blocks; q++) { + dst[index] = fmax( + src[index_src + p * src_stride * d.cols + q * d.rows], + dst[index]); + } + } +} + template __global__ static void _set_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, @@ -3952,6 +3992,19 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } } +void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, + int32_cuda num_row_blocks, int32_cuda num_col_blocks, + float* dst, MatrixDim d, int src_stride, + int A_trans) { + if (A_trans) { + _max_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, + dst, d, src_stride); + } else { + _max_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, + d, src_stride); + } +} + void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, MatrixDim src_dim, float *dst, MatrixDim dst_dim) { _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); @@ -4656,6 +4709,19 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } } +void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, + int32_cuda num_row_blocks, int32_cuda num_col_blocks, + double* dst, MatrixDim d, int src_stride, + int A_trans) { + if (A_trans) { + _max_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, + dst, d, src_stride); + } else { + _max_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, + d, src_stride); + } +} + void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 8f719a8c4a1..234c4fe48de 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -158,6 +158,20 @@ inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, + const double *src, int32_cuda num_row_blocks, + int32_cuda num_col_blocks, double *dst, + MatrixDim d, int src_stride, int A_trans) { + cudaD_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, + d, src_stride, A_trans); +} +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, + int32_cuda num_row_blocks, + int32_cuda num_col_blocks, float *dst, + MatrixDim d, int src_stride, int A_trans) { + cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, + d, src_stride, A_trans); +} inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index beccd9dc4a5..a7000d93697 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1187,6 +1187,92 @@ void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, } } + +template +void CuMatrixBase::MaxMatBlocks(Real alpha, const CuMatrixBase &A, + MatrixTransposeType transA) { + if (num_rows_ == 0 || num_cols_ == 0) return; + + if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) && + A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) { + // This is the "summing", not broadcasting, version of AddMatBlocks. + // It supports both regular and transposed operation. + int32 num_row_blocks, num_col_blocks; + if (transA == kNoTrans) { + KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); + num_row_blocks = A.Mat().NumRows() / num_rows_; + num_col_blocks = A.Mat().NumCols() / num_cols_; + } else { + KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); + num_row_blocks = A.Mat().NumRows() / num_cols_; + num_col_blocks = A.Mat().NumCols() / num_rows_; + } +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_max_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, + num_col_blocks, data_, Dim(), A.Stride(), + (transA == kTrans ? 1 : 0)); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim); + } //else +#endif + // { + // int32 nr, nc; + // if (transA == kNoTrans) { + // nr = num_rows_; + // nc = num_cols_; + // } else { + // nr = num_cols_; + // nc = num_rows_; + // } + // for (int32 i = 0; i < num_row_blocks; i++) { + // for (int32 j = 0; j < num_col_blocks; j++) { + // Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), + // transA); + // } + // } + // } +// } else { +// // This is the "broadcasting" version of MaxMatBlocks, where +// // *this is larger than src. +// if (transA != kNoTrans) +// KALDI_ERR << "Transposed operation not supported currently."; +// if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0)) +// KALDI_ERR << "Invalid sizes of arguments"; +// #if HAVE_CUDA == 1 +// if (CuDevice::Instantiate().Enabled()) { +// CuTimer tim; +// dim3 dimGrid, dimBlock; +// GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), +// &dimGrid, &dimBlock); +// cuda_add_mat_repeated(dimGrid, dimBlock, alpha, +// A.data_, A.Dim(), data_, Dim()); +// CU_SAFE_CALL(cudaGetLastError()); +// CuDevice::Instantiate().AccuProfile(__func__, tim); +// } else +// #endif +// { +// const MatrixBase &src_mat = A.Mat(), +// &this_mat = this->Mat(); +// for (int32 row_offset = 0; row_offset < NumRows(); +// row_offset += src_mat.NumRows()) { +// for (int32 col_offset = 0; col_offset < NumCols(); +// col_offset += src_mat.NumCols()) { +// SubMatrix this_part(this_mat, +// row_offset, src_mat.NumRows(), +// col_offset, src_mat.NumCols()); +// this_part.AddMat(alpha, src_mat); +// } +// } +// } + } +} + /// dst = a * b / c (by element; when c = 0, dst = a) /// dst can be an alias of a, b or c safely and get expected result. template diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 03e69b639d3..334daf65645 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -509,6 +509,9 @@ class CuMatrixBase { void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); + void MaxMatBlocks(Real alpha, const CuMatrixBase &A, + MatrixTransposeType trans = kNoTrans); + /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); /// (for each row r of *this), r = alpha * row + beta * r diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 4eb078c0fcb..02b7d661459 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -5860,6 +5860,77 @@ void SumBlockComponent::Backprop( } } +MaxPoolingOverBlock::MaxPoolingOverBlock(const MaxPoolingOverBlock &other): + input_dim_(other.input_dim_), output_dim_(other.output_dim_), + scale_(other.scale_) { } + +void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) { + scale_ = 1.0; + bool ok = cfl->GetValue("input-dim", &input_dim_) && + cfl->GetValue("output-dim", &output_dim_); + if (!ok) + KALDI_ERR << "input-dim and output-dim must both be provided."; + if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0) + KALDI_ERR << "Invalid values input-dim=" << input_dim_ + << " output-dim=" << output_dim_; + cfl->GetValue("scale", &scale_); + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); +} + +void MaxPoolingOverBlock::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &output_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &scale_); + ExpectToken(is, binary, ""); +} + +void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, scale_); + WriteToken(os, binary, ""); +} + +std::string MaxPoolingOverBlock::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << input_dim_ + << ", output-dim=" << output_dim_ + << ", scale=" << scale_; + return stream.str(); +} + +void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(out->NumRows() == in.NumRows() && + out->NumCols() == output_dim_ && + in.NumCols() == input_dim_); + out->MaxMatBlocks(scale_, in, kNoTrans); + return NULL; +} + +void MaxPoolingOverBlock::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv) { + in_deriv->MaxMatBlocks(scale_, out_deriv, kNoTrans); + } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 3929c253aab..0f83631b7e6 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1220,6 +1220,49 @@ class SumBlockComponent: public Component { SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow. }; +/** MaxPoolingOverBlock gets maximum value over blocks of its input: for instance, if + you create one with the config "input-dim=400 output-dim=100", + its output will be the maximum value over the 4 100-dimensional blocks of + the input. + + Accepted values on its config-file line are: + input-dim The input dimension. Required. + output-dim The block dimension. Required. Must divide input-dim. + scale A scaling factor on the output. Defaults to 1.0. + */ +class MaxPoolingOverBlock: public Component { + public: + explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other); + MaxPoolingOverBlock() { } + virtual std::string Type() const { return "MaxPoolingOverBlock"; } + virtual int32 Properties() const { + return kSimpleComponent|kPropagateAdds|kBackpropAdds; + } + virtual void (ConfigLine *cfl); + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { return output_dim_; } + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; + virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + private: + int32 input_dim_; + int32 output_dim_; + BaseFloat scale_; + MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. +}; + /* ClipGradientComponent just duplicates its input, but clips gradients From ecb5a86ec0cd03af7103252ed2fefcb156aaad33 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Thu, 12 Apr 2018 19:45:12 -0400 Subject: [PATCH 02/15] maxpooling component over blocks: repeated func --- src/cudamatrix/cu-kernels.cu | 24 +++++++++++++++++++++++ src/cudamatrix/cu-kernels.h | 18 +++++++++++++---- src/cudamatrix/cu-matrix.cc | 38 ++++++++++++++++++------------------ 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index ae2781f1b07..fe1bcf60fc9 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -813,6 +813,21 @@ static void _max_mat_blocks_trans(Real alpha, const Real* src, } } +template +__global__ +static void _max_mat_repeated(Real alpha, const Real* src, + MatrixDim src_dim, Real* dst, + MatrixDim dst_dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda src_i = i % src_dim.cols, + src_j = j % src_dim.rows, + dst_index = i + j * dst_dim.stride, + src_index = src_i + src_j * src_dim.stride; + if (i < dst_dim.cols && j < dst_dim.rows) + dst[dst_index] += alpha * src[src_index]; +} + template __global__ static void _set_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, @@ -4010,6 +4025,10 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } +void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, + MatrixDim src_dim, float *dst, MatrixDim dst_dim) { + _max_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); +} void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, @@ -4727,6 +4746,11 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } +void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, + MatrixDim src_dim, double *dst, MatrixDim dst_dim) { + _max_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); +} + void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 234c4fe48de..d106c973aa1 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -158,6 +158,16 @@ inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } +inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, + const double *src, MatrixDim src_dim, + double *dst, MatrixDim dst_dim) { + cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); +} +inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, + const float *src, MatrixDim src_dim, + float *dst, MatrixDim dst_dim) { + cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); +} inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, @@ -172,15 +182,15 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } -inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, +inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { - cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); + cudaD_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); } -inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, +inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim) { - cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); + cudaF_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index a7000d93697..bf1f019d3ac 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1237,25 +1237,25 @@ void CuMatrixBase::MaxMatBlocks(Real alpha, const CuMatrixBase &A, // } // } // } -// } else { -// // This is the "broadcasting" version of MaxMatBlocks, where -// // *this is larger than src. -// if (transA != kNoTrans) -// KALDI_ERR << "Transposed operation not supported currently."; -// if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0)) -// KALDI_ERR << "Invalid sizes of arguments"; -// #if HAVE_CUDA == 1 -// if (CuDevice::Instantiate().Enabled()) { -// CuTimer tim; -// dim3 dimGrid, dimBlock; -// GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), -// &dimGrid, &dimBlock); -// cuda_add_mat_repeated(dimGrid, dimBlock, alpha, -// A.data_, A.Dim(), data_, Dim()); -// CU_SAFE_CALL(cudaGetLastError()); -// CuDevice::Instantiate().AccuProfile(__func__, tim); -// } else -// #endif + } else { + // This is the "broadcasting" version of MaxMatBlocks, where + // *this is larger than src. + if (transA != kNoTrans) + KALDI_ERR << "Transposed operation not supported currently."; + if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0)) + KALDI_ERR << "Invalid sizes of arguments"; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_max_mat_repeated(dimGrid, dimBlock, alpha, + A.data_, A.Dim(), data_, Dim()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } //else +#endif // { // const MatrixBase &src_mat = A.Mat(), // &this_mat = this->Mat(); From b626325c01280cbc2a073c9162bafe327a548396 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Mon, 23 Apr 2018 10:45:58 -0400 Subject: [PATCH 03/15] remove scale parameter and add comments --- src/cudamatrix/cu-kernels.cu | 28 ++++++++++++++-------------- src/cudamatrix/cu-kernels.h | 8 ++++---- src/cudamatrix/cu-matrix.cc | 6 +++--- src/cudamatrix/cu-matrix.h | 18 +++++++++++++++++- src/nnet3/nnet-simple-component.cc | 16 ++++------------ src/nnet3/nnet-simple-component.h | 2 -- 6 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index fe1bcf60fc9..63ac13cd6ef 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -775,7 +775,7 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src, template __global__ -static void _max_mat_blocks(Real alpha, const Real* src, +static void _max_mat_blocks(const Real* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, Real* dst, MatrixDim d, int src_stride) { @@ -795,7 +795,7 @@ static void _max_mat_blocks(Real alpha, const Real* src, template __global__ -static void _max_mat_blocks_trans(Real alpha, const Real* src, +static void _max_mat_blocks_trans(const Real* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, Real* dst, MatrixDim d, int src_stride) { @@ -815,7 +815,7 @@ static void _max_mat_blocks_trans(Real alpha, const Real* src, template __global__ -static void _max_mat_repeated(Real alpha, const Real* src, +static void _max_mat_repeated(const Real* src, MatrixDim src_dim, Real* dst, MatrixDim dst_dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; @@ -825,7 +825,7 @@ static void _max_mat_repeated(Real alpha, const Real* src, dst_index = i + j * dst_dim.stride, src_index = src_i + src_j * src_dim.stride; if (i < dst_dim.cols && j < dst_dim.rows) - dst[dst_index] += alpha * src[src_index]; + dst[dst_index] += src[src_index]; } template @@ -4007,15 +4007,15 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } } -void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, +void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float* dst, MatrixDim d, int src_stride, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, + _max_mat_blocks_trans<<>>(src, num_row_blocks, num_col_blocks, dst, d, src_stride); } else { - _max_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, + _max_mat_blocks<<>>(src, num_row_blocks, num_col_blocks, dst, d, src_stride); } } @@ -4025,9 +4025,9 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, +void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, const float* src, MatrixDim src_dim, float *dst, MatrixDim dst_dim) { - _max_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); + _max_mat_repeated<<>>(src, src_dim, dst, dst_dim); } void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, @@ -4728,15 +4728,15 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } } -void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, +void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double* dst, MatrixDim d, int src_stride, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, + _max_mat_blocks_trans<<>>(src, num_row_blocks, num_col_blocks, dst, d, src_stride); } else { - _max_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, + _max_mat_blocks<<>>(src, num_row_blocks, num_col_blocks, dst, d, src_stride); } } @@ -4746,9 +4746,9 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, +void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, const double* src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { - _max_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); + _max_mat_repeated<<>>(src, src_dim, dst, dst_dim); } void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index d106c973aa1..27a8d8a34fe 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -168,26 +168,26 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, float *dst, MatrixDim dst_dim) { cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); } -inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, double alpha, +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } -inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } -inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, double alpha, +inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { cudaD_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); } -inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, float alpha, +inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim) { cudaF_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index bf1f019d3ac..c74e3caf3eb 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1189,7 +1189,7 @@ void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, template -void CuMatrixBase::MaxMatBlocks(Real alpha, const CuMatrixBase &A, +void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, MatrixTransposeType transA) { if (num_rows_ == 0 || num_cols_ == 0) return; @@ -1213,7 +1213,7 @@ void CuMatrixBase::MaxMatBlocks(Real alpha, const CuMatrixBase &A, dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_max_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, num_row_blocks, num_col_blocks, data_, Dim(), A.Stride(), (transA == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); @@ -1250,7 +1250,7 @@ void CuMatrixBase::MaxMatBlocks(Real alpha, const CuMatrixBase &A, dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_max_mat_repeated(dimGrid, dimBlock, alpha, + cuda_max_mat_repeated(dimGrid, dimBlock, A.data_, A.Dim(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 334daf65645..6d3b35ed19c 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -509,7 +509,23 @@ class CuMatrixBase { void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); - void MaxMatBlocks(Real alpha, const CuMatrixBase &A, + /// This function does *this = max(*this, src). similar with AddMatBlocks, + /// it supports cases where *this and src have different dimension. + /// There are two allowed cases: + /// + /// (1) *this is larger than src; this case is probably for backpropagation. + /// In this case, we do a broadcasting operation. *this must + /// have NumRows() == a * src.NumRows() and NumCols() == b * + /// src.NumCols() for integer a >= 1, b >= 1. *this will be treated as + /// being made up of of blocks with the same size as src, and to each + /// block we'll add src. This case does not support trans == kTrans. + /// + /// (2) *this is smaller than src; we select the maximum. src.NumRows() must + /// == a * this->NumRows(), and src.NumCols() must == b * this->NumCols(), + /// for a >= 1, b >= 1. In this case, src will be treated as being made + /// up of blocks with the same size as *this, and to *this we will select + /// max value over all of those blocks. + void MaxMatBlocks(const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); /// (for each column c of *this), c = alpha * col + beta * c diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 02b7d661459..3bfd6c7d100 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -5861,11 +5861,9 @@ void SumBlockComponent::Backprop( } MaxPoolingOverBlock::MaxPoolingOverBlock(const MaxPoolingOverBlock &other): - input_dim_(other.input_dim_), output_dim_(other.output_dim_), - scale_(other.scale_) { } + input_dim_(other.input_dim_), output_dim_(other.output_dim_) { } void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) { - scale_ = 1.0; bool ok = cfl->GetValue("input-dim", &input_dim_) && cfl->GetValue("output-dim", &output_dim_); if (!ok) @@ -5873,7 +5871,6 @@ void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) { if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0) KALDI_ERR << "Invalid values input-dim=" << input_dim_ << " output-dim=" << output_dim_; - cfl->GetValue("scale", &scale_); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -5884,8 +5881,6 @@ void MaxPoolingOverBlock::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &input_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &output_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &scale_); ExpectToken(is, binary, ""); } @@ -5895,16 +5890,13 @@ void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, input_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, output_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, scale_); WriteToken(os, binary, ""); } std::string MaxPoolingOverBlock::Info() const { std::ostringstream stream; stream << Type() << ", input-dim=" << input_dim_ - << ", output-dim=" << output_dim_ - << ", scale=" << scale_; + << ", output-dim=" << output_dim_; return stream.str(); } @@ -5914,7 +5906,7 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, KALDI_ASSERT(out->NumRows() == in.NumRows() && out->NumCols() == output_dim_ && in.NumCols() == input_dim_); - out->MaxMatBlocks(scale_, in, kNoTrans); + out->MaxMatBlocks(in, kNoTrans); return NULL; } @@ -5928,7 +5920,7 @@ void MaxPoolingOverBlock::Backprop( Component *to_update, CuMatrixBase *in_deriv) const { if (in_deriv) { - in_deriv->MaxMatBlocks(scale_, out_deriv, kNoTrans); + in_deriv->MaxMatBlocks(out_deriv, kNoTrans); } } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 0f83631b7e6..3983d87816b 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1228,7 +1228,6 @@ class SumBlockComponent: public Component { Accepted values on its config-file line are: input-dim The input dimension. Required. output-dim The block dimension. Required. Must divide input-dim. - scale A scaling factor on the output. Defaults to 1.0. */ class MaxPoolingOverBlock: public Component { public: @@ -1259,7 +1258,6 @@ class MaxPoolingOverBlock: public Component { private: int32 input_dim_; int32 output_dim_; - BaseFloat scale_; MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. }; From f76586de2c747fa33679770b4849a9068e6d1b91 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Mon, 23 Apr 2018 17:07:22 -0400 Subject: [PATCH 04/15] basic back-propagation(fake cuda)(stride = pool_size) --- src/nnet3/nnet-simple-component.cc | 21 +++++++++++++++++---- src/nnet3/nnet-simple-component.h | 4 ++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 3bfd6c7d100..a2f51e25a52 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -5913,14 +5913,27 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, void MaxPoolingOverBlock::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, //in_value - const CuMatrixBase &, // out_value, + const CuMatrixBase &in_value, //in_value + const CuMatrixBase &out_value, // out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { - if (in_deriv) { - in_deriv->MaxMatBlocks(out_deriv, kNoTrans); + // if (in_deriv) { + // in_deriv->MaxMatBlocks(out_deriv, kNoTrans); + // } + if (!in_deriv) + return; + + int32 num_pools = input_dim_; + int32 pool_size = output_dim_ / input_dim_; + + for (int32 q = 0; q < pool_size; q++) { + // zero-out mask + CuMatrix mask; + out_value.EqualElementMask(in_value.ColRange(q * num_pools, num_pools), &mask); + mask.MulElements(out_deriv); + in_deriv.ColRange(q * num_pools, num_pools).CopyFromMat(mask); } } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 3983d87816b..6284a686333 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1249,8 +1249,8 @@ class MaxPoolingOverBlock: public Component { CuMatrixBase *out) const; virtual void Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, //in_value - const CuMatrixBase &, // out_value, + const CuMatrixBase &in_value, //in_value + const CuMatrixBase &out_value, // out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, From f284026155d3abef914635c46f39dac07107c0f4 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Mon, 30 Apr 2018 21:39:29 -0400 Subject: [PATCH 05/15] structure of 2.5d maxpooling component (compatible with nnet-convolution-component)(without cuda functions) --- src/cudamatrix/cu-matrix.cc | 118 ++++++++------- src/cudamatrix/cu-matrix.h | 37 ++--- src/nnet3/nnet-convolutional-component.cc | 163 +++++++++++++++++++++ src/nnet3/nnet-convolutional-component.h | 166 +++++++++++++++++++++- src/nnet3/nnet-simple-component.cc | 77 ---------- src/nnet3/nnet-simple-component.h | 42 ------ 6 files changed, 415 insertions(+), 188 deletions(-) diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c74e3caf3eb..dcd37b76709 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1190,86 +1190,100 @@ void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, template void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, - MatrixTransposeType transA) { + vector index_max, + MatrixTransposeType transA, + const int32 input_t_dim_, + const int32 pool_t_size_, + const int32 pool_t_step_, + const int32 input_h_dim_, + const int32 pool_h_size_, + const int32 pool_h_step_, + const int32 input_f_dim_, + const int32 pool_f_size_, + const int32 pool_f_step_) { if (num_rows_ == 0 || num_cols_ == 0) return; + int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + + // Not sure whether this needed? + KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0 && + (input_h_dim_ - pool_h_size_) % pool_h_step_ == 0 && + (input_f_dim_ - pool_f_size_) % pool_f_step_ == 0); + if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) && A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) { - // This is the "summing", not broadcasting, version of AddMatBlocks. + // This is the "forward-propagation" version of MaxMatBlocks. // It supports both regular and transposed operation. - int32 num_row_blocks, num_col_blocks; + if (transA == kNoTrans) { - KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); - num_row_blocks = A.Mat().NumRows() / num_rows_; - num_col_blocks = A.Mat().NumCols() / num_cols_; + KALDI_ASSERT(A.NumRows() == input_t_dim_ && + A.NumCols() == input_h_dim_ * input_f_dim_ && + num_rows_ == num_pools_t && + num_cols_ == num_pools_h * num_pools_f); + } else { - KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); - num_row_blocks = A.Mat().NumRows() / num_cols_; - num_col_blocks = A.Mat().NumCols() / num_rows_; + KALDI_ASSERT(A.NumCols() == input_t_dim_ && + A.NumRows() == input_h_dim_ * input_f_dim_ && + num_cols_ == num_pools_t && + num_rows_ == num_pools_h * num_pools_f); + } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, num_row_blocks, - num_col_blocks, data_, Dim(), A.Stride(), - (transA == kTrans ? 1 : 0)); + dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f); + dim3 dimGrid(1024); + // dim3 dimGrid, dimBlock; + // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + // &dimGrid, &dimBlock); + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, Dim(), + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_, (transA == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); - } //else + } else #endif // { - // int32 nr, nc; - // if (transA == kNoTrans) { - // nr = num_rows_; - // nc = num_cols_; - // } else { - // nr = num_cols_; - // nc = num_rows_; - // } - // for (int32 i = 0; i < num_row_blocks; i++) { - // for (int32 j = 0; j < num_col_blocks; j++) { - // Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), - // transA); - // } - // } + // TO DO + // maxpooling without cuda // } } else { - // This is the "broadcasting" version of MaxMatBlocks, where + + // This is the "backward-propagation" version of MaxMatBlocks, where // *this is larger than src. - if (transA != kNoTrans) - KALDI_ERR << "Transposed operation not supported currently."; - if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0)) - KALDI_ERR << "Invalid sizes of arguments"; + if (transA == kNoTrans){ + KALDI_ASSERT(A.NumRows() == num_pools_t && + A.NumCols() == num_pools_h * num_pools_f && + num_rows_ == input_t_dim_ && + num_cols_ == input_h_dim_ * input_f_dim_); + } else { + KALDI_ASSERT(A.NumCols() == num_pools_t && + A.NumRows() == num_pools_h * num_pools_f && + num_cols_ == input_t_dim_ && + num_rows_ == input_h_dim_ * input_f_dim_); + } + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_max_mat_repeated(dimGrid, dimBlock, + cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_, A.data_, A.Dim(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); - } //else -#endif -// { -// const MatrixBase &src_mat = A.Mat(), -// &this_mat = this->Mat(); -// for (int32 row_offset = 0; row_offset < NumRows(); -// row_offset += src_mat.NumRows()) { -// for (int32 col_offset = 0; col_offset < NumCols(); -// col_offset += src_mat.NumCols()) { -// SubMatrix this_part(this_mat, -// row_offset, src_mat.NumRows(), -// col_offset, src_mat.NumCols()); -// this_part.AddMat(alpha, src_mat); -// } -// } -// } + } else +#endif + // { + // TO DO + // maxpooling backward propagation without cuda + // } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 6d3b35ed19c..b12819eac39 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -509,24 +509,29 @@ class CuMatrixBase { void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); - /// This function does *this = max(*this, src). similar with AddMatBlocks, - /// it supports cases where *this and src have different dimension. - /// There are two allowed cases: + + /// This function is used for do the maxpooling over blocks. The detailed + /// description is written in the MaxPoolingOverBlock component in file + /// nnet-convolutional-component.h /// - /// (1) *this is larger than src; this case is probably for backpropagation. - /// In this case, we do a broadcasting operation. *this must - /// have NumRows() == a * src.NumRows() and NumCols() == b * - /// src.NumCols() for integer a >= 1, b >= 1. *this will be treated as - /// being made up of of blocks with the same size as src, and to each - /// block we'll add src. This case does not support trans == kTrans. + /// To point out, this function has two version 'forward-propagate' and + /// 'backward-propagate': /// - /// (2) *this is smaller than src; we select the maximum. src.NumRows() must - /// == a * this->NumRows(), and src.NumCols() must == b * this->NumCols(), - /// for a >= 1, b >= 1. In this case, src will be treated as being made - /// up of blocks with the same size as *this, and to *this we will select - /// max value over all of those blocks. - void MaxMatBlocks(const CuMatrixBase &A, - MatrixTransposeType trans = kNoTrans); + /// (1) When the size of input matrix &A is larger than *this, it is then a + /// 'forward-propagate' version, and the function do the maxpooling + /// depending on the parameters. Meanwhile, it stores the index of + /// maximum value in each pool in vector 'index_max_' for backpropagation. + /// + /// (2) When the size of input matrix &A is smaller than *this, it is then a + /// 'backward-propagate' version. According to the vector 'index_max_', the + /// function set all the values in &out_deriv whose index is not in + /// vector(not corresponding to maximum value in each pool of &in_value) + /// as zero, and keeps those correponding to maximum value as the *in_deriv. + void MaxMatBlocks(const CuMatrixBase &A, vector index_max_, + MatrixTransposeType trans = kNoTrans, + const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, + const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_, + const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_); /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index bea3b9d31d5..73de0ea5844 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -666,6 +666,169 @@ void TimeHeightConvolutionComponent::PrecomputedIndexes::Read( ExpectToken(is, binary, ""); } +MaxPoolingOverBlock::MaxPoolingOverBlock( + const MaxPoolingOverBlock &other): + input_t_dim_(other.input_t_dim_), + input_h_dim_(other.input_h_dim_), + input_f_dim_(other.input_f_dim_), + pool_t_size_(other.pool_t_size_), + pool_h_size_(other.pool_h_size_), + pool_f_size_(other.pool_f_size_), + pool_t_step_(other.pool_t_step_), + pool_h_step_(other.pool_h_step_), + pool_f_step_(other.pool_f_step_) { } + +// aquire input dim +int32 MaxpoolingComponent::InputDim() const { + return input_t_dim_ * input_h_dim_ * input_f_dim_; +} + +// aquire output dim +int32 MaxpoolingComponent::OutputDim() const { + int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + return num_pools_t * num_pools_h * num_pools_f; +} + +// check the component parameters +void MaxpoolingComponent::Check() const { + // sanity check of the max pooling parameters + KALDI_ASSERT(input_t_dim_ > 0); + KALDI_ASSERT(input_h_dim_ > 0); + KALDI_ASSERT(input_f_dim_ > 0); + KALDI_ASSERT(pool_t_size_ > 0); + KALDI_ASSERT(pool_h_size_ > 0); + KALDI_ASSERT(pool_f_size_ > 0); + KALDI_ASSERT(pool_t_step_ > 0); + KALDI_ASSERT(pool_h_step_ > 0); + KALDI_ASSERT(pool_f_step_ > 0); + KALDI_ASSERT(input_t_dim_ >= pool_t_size_); + KALDI_ASSERT(input_h_dim_ >= pool_h_size_); + KALDI_ASSERT(input_f_dim_ >= pool_f_size_); + KALDI_ASSERT(pool_t_size_ >= pool_t_step_); + KALDI_ASSERT(pool_h_size_ >= pool_h_step_); + KALDI_ASSERT(pool_f_size_ >= pool_f_step_); + KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0); + KALDI_ASSERT((input_h_dim_ - pool_h_size_) % pool_h_step_ == 0); + KALDI_ASSERT((input_f_dim_ - pool_f_size_) % pool_f_step_ == 0); +} + +// initialize the component using configuration file +void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) { + bool ok = true; + + ok = ok && cfl->GetValue("input-t-dim", &input_t_dim_); + ok = ok && cfl->GetValue("input-h-dim", &input_h_dim_); + ok = ok && cfl->GetValue("input-f-dim", &input_f_dim_); + ok = ok && cfl->GetValue("pool-t-size", &pool_t_size_); + ok = ok && cfl->GetValue("pool-h-size", &pool_h_size_); + ok = ok && cfl->GetValue("pool-f-size", &pool_f_size_); + ok = ok && cfl->GetValue("pool-t-step", &pool_t_step_); + ok = ok && cfl->GetValue("pool-h-step", &pool_h_step_); + ok = ok && cfl->GetValue("pool-f-step", &pool_f_step_); + + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (!ok) + KALDI_ERR << "Bad initializer " + << cfl->WholeLine(); + + Check(); +} + +void MaxPoolingOverBlock::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_t_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_h_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_f_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_t_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_h_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_f_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_t_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_h_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_f_step_); + ExpectToken(is, binary, ""); + Check(); +} + +void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_t_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_h_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_f_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_t_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_h_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_f_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_t_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_h_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_f_step_); + WriteToken(os, binary, ""); +} + +// display information about component +std::string MaxPoolingOverBlock::Info() const { + std::ostringstream stream; + stream << Type() + << ", input-t-dim=" << input_t_dim_ + << ", input-h-dim=" << input_h_dim_ + << ", input-f-dim=" << input_f_dim_ + << ", pool-t-size=" << pool_t_size_ + << ", pool-h-size=" << pool_h_size_ + << ", pool-f-size=" << pool_f_size_ + << ", pool-t-step=" << pool_t_step_ + << ", pool-h-step=" << pool_h_step_ + << ", pool-f-step=" << pool_f_step_; + return stream.str(); +} + +void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + + out->MaxMatBlocks(in, index_max_, kNoTrans, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); + return NULL; +} + +void MaxPoolingOverBlock::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, //in_value + const CuMatrixBase &out_value, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + + if (in_deriv) { + in_derv->MaxMatBlocks(out_deriv, index_max_, kNoTrans, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); + + } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 35cf0de11c9..4677c894e69 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -25,6 +25,7 @@ #include "nnet3/natural-gradient-online.h" #include "nnet3/convolution.h" #include +#include namespace kaldi { namespace nnet3 { @@ -370,8 +371,171 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { OnlineNaturalGradient preconditioner_out_; }; +/** MaxPoolingOverBlock gets maximum value over blocks of its input + this component should be compatible with TimeHeightConvolutionComponent + + MaxPoolingOverBlock : + MaxPoolingOverBlock component was firstly used in ConvNet for selecting an + representative activation in an area. It inspired Maxout nonlinearity. + Each output element of this component is the maximum of a block of + input elements where the block has a 2.5D dimension (pool_t_size_, + pool_h_size_ * pool_f_size_). + Blocks could overlap if the shift value on any axis is smaller + than its corresponding pool size (e.g. pool_t_step_ < pool_t_size_). + If the shift values are euqal to their pool size, there is no + overlap; while if they all equal 1, the blocks overlap to + the greatest possible extent. + + This component is designed to be used after a ConvolutionComponent + so that the input matrix is propagated from a 2d-convolutional layer. + This component implements 2.5d-maxpooling which performs + max pooling along the three axes. + + Input : A 2.5D matrix with dimensions: + t: (e.g. time) + h: (e.g. height, mel-frequency) + f: (e.g. channels like number of filters in the ConvolutionComponent) + + The reason why we call the matrix 2.5D is because we compress the 3D block + into a 2D matrix by concatenating each 2D matrix at different channel like: + + h = 0 h = 1 + |------------------------|------------------------|----... + f=0 f=1 f=2 ... f=n f=0 f=1 f=2 ... f=n + |----|----|----|----|----|----|----|----|----|----|----... + t=0 **** **** **** **** ****|**** **** **** **** ****|****... - + t=1 **** **** **** **** ****|**** **** **** **** ****|****... | m + t=2 **** **** **** **** ****|**** **** **** **** ****|****... | a + t=3 **** **** **** **** ****|**** **** **** **** ****|****... | t + t=4 **** **** **** **** ****|**** **** **** **** ****|****... | r + t=5 **** **** **** **** ****|**** **** **** **** ****|****... | i + t=6 **** **** **** **** ****|**** **** **** **** ****|****... | x + t=7 **** **** **** **** ****|**** **** **** **** ****|****... - + + In this case, if we set pool_t_size = 2, pool_t_step = 1 + pool_h_size = 2, pool_h_step = 1 + pool_f_size = 2, pool_f_step = 1 + Then, the pooling block is like: + + h = 0 h = 1 h = 0 h = 1 h = 1 h = 2 + |---------|---------| |---------|---------| |---------|---------| + f=0 f=1 f=0 f=1 f=1 f=2 f=1 f=2 f=0 f=1 f=0 f=1 + |----|----|----|----| |----|----|----|----| ... |----|----|----|----| ...... + t=0 **** **** **** **** t=0 **** **** **** **** t=0 **** **** **** **** + t=1 **** **** **** **** t=1 **** **** **** **** t=1 **** **** **** **** + + + h = 0 f = 1 h = 0 h = 1 h = 1 h = 2 + |---------|---------| |---------|---------| |---------|---------| + f=0 f=1 f=0 f=1 f=1 f=2 f=1 f=2 f=0 f=1 f=0 f=1 + |----|----|----|----| |----|----|----|----| ... |----|----|----|----| ...... + t=1 **** **** **** **** t=1 **** **** **** **** t=1 **** **** **** **** + t=2 **** **** **** **** t=2 **** **** **** **** t=2 **** **** **** **** + + . . . + . . . + . . . + . . . + . . . + . . . + + Since the stride of filter(pool_f_step) is usually smaller than the + stride of height(poo_h_step), we arrange each row of output as: + (all filters for height 0)(all filters for height 1)... + + Parameters: + + input_t_dim_ size of the input along t-axis + (e.g. number of time steps) + input_h_dim_ size of input along h-axis + (e.g. number of mel-frequency bins) + input_f_dim_ size of input along f-axis + (e.g. number of filters in the ConvolutionComponent) + + pool_t_size_ size of the pooling window along t-axis + pool_h_size_ size of the pooling window along h-axis + pool_f_size_ size of the pooling window along f-axis + + pool_t_step_ the number of steps taken along t-axis of input + before computing the next pool (e.g. the stride + size along t-axis) + pool_h_step_ the number of steps taken along h-axis of input + before computing the next pool (e.g. the stride + size along t-axis) + pool_f_step_ the number of steps taken along f-axis of input + before computing the next pool (e.g. the stride + size along t-axis) + + index_max_ a vector that store the index of the maximum + value as (t,h,f), used in back-propagation. + + + + Output : The output is also a 2.5D tensor with dimension (num_block_t by + num_block_h * num_block_f) where: + + num_block_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + // the number of blocks in t dimension + num_block_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + // the number of blocks in h dimension + num_block_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + // the number of blocks in f dimension + + + */ +class MaxPoolingOverBlock: public Component { + public: + explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other); + MaxPoolingOverBlock(): input_t_dim_(0), input_h_dim_(0), input_f_dim_(0), + pool_t_size_(0), pool_h_size_(0), pool_f_size_(0), + pool_t_step_(0), pool_h_step_(0), pool_f_step_(0) { } + virtual std::string Type() const { return "MaxPoolingOverBlock"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|kBackpropAdds; + } + virtual void InitFromConfig(ConfigLine *cfl); + virtual int32 InputDim() const;// { return input_dim_; } + virtual int32 OutputDim() const;// { return output_dim_; } + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; + virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, //in_value + const CuMatrixBase &out_value, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + virtual void Check() const; - + private: + int32 input_t_dim_; // size of the input along t-axis + // (e.g. number of time steps) + int32 input_h_dim_; // size of input along h-axis + // (e.g. number of mel-frequency bins) + int32 input_f_dim_; // size of input along f-axis + // (e.g. number of filters in the ConvolutionComponent) + + int32 pool_t_size_; // size of the pooling window along t-axis + int32 pool_h_size_; // size of the pooling window along h-axis + int32 pool_f_size_; // size of the pooling window along f-axis + + int32 pool_t_step_; // the number of steps taken along t-axis of input + // before computing the next pool + int32 pool_h_step_; // the number of steps taken along h-axis of input + // before computing the next pool + int32 pool_f_step_; // the number of steps taken along f-axis of input + // before computing the next pool + + vector index_max_; + + MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. +}; } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index a2f51e25a52..081e61bfa28 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -5860,82 +5860,5 @@ void SumBlockComponent::Backprop( } } -MaxPoolingOverBlock::MaxPoolingOverBlock(const MaxPoolingOverBlock &other): - input_dim_(other.input_dim_), output_dim_(other.output_dim_) { } - -void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) { - bool ok = cfl->GetValue("input-dim", &input_dim_) && - cfl->GetValue("output-dim", &output_dim_); - if (!ok) - KALDI_ERR << "input-dim and output-dim must both be provided."; - if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0) - KALDI_ERR << "Invalid values input-dim=" << input_dim_ - << " output-dim=" << output_dim_; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); -} - -void MaxPoolingOverBlock::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &input_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &output_dim_); - ExpectToken(is, binary, ""); -} - -void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, output_dim_); - WriteToken(os, binary, ""); -} - -std::string MaxPoolingOverBlock::Info() const { - std::ostringstream stream; - stream << Type() << ", input-dim=" << input_dim_ - << ", output-dim=" << output_dim_; - return stream.str(); -} - -void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(out->NumRows() == in.NumRows() && - out->NumCols() == output_dim_ && - in.NumCols() == input_dim_); - out->MaxMatBlocks(in, kNoTrans); - return NULL; -} - -void MaxPoolingOverBlock::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, //in_value - const CuMatrixBase &out_value, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const { - // if (in_deriv) { - // in_deriv->MaxMatBlocks(out_deriv, kNoTrans); - // } - if (!in_deriv) - return; - - int32 num_pools = input_dim_; - int32 pool_size = output_dim_ / input_dim_; - - for (int32 q = 0; q < pool_size; q++) { - // zero-out mask - CuMatrix mask; - out_value.EqualElementMask(in_value.ColRange(q * num_pools, num_pools), &mask); - mask.MulElements(out_deriv); - in_deriv.ColRange(q * num_pools, num_pools).CopyFromMat(mask); - } -} - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 6284a686333..55e7ff703cf 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1220,48 +1220,6 @@ class SumBlockComponent: public Component { SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow. }; -/** MaxPoolingOverBlock gets maximum value over blocks of its input: for instance, if - you create one with the config "input-dim=400 output-dim=100", - its output will be the maximum value over the 4 100-dimensional blocks of - the input. - - Accepted values on its config-file line are: - input-dim The input dimension. Required. - output-dim The block dimension. Required. Must divide input-dim. - */ -class MaxPoolingOverBlock: public Component { - public: - explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other); - MaxPoolingOverBlock() { } - virtual std::string Type() const { return "MaxPoolingOverBlock"; } - virtual int32 Properties() const { - return kSimpleComponent|kPropagateAdds|kBackpropAdds; - } - virtual void (ConfigLine *cfl); - virtual int32 InputDim() const { return input_dim_; } - virtual int32 OutputDim() const { return output_dim_; } - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - virtual std::string Info() const; - virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, //in_value - const CuMatrixBase &out_value, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const; - private: - int32 input_dim_; - int32 output_dim_; - MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. -}; - - /* ClipGradientComponent just duplicates its input, but clips gradients during backpropagation if they cross a predetermined threshold. From 79f2c5f1f85aa2160caf48a873c31ed7f64dfdf0 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Thu, 3 May 2018 09:55:18 -0400 Subject: [PATCH 06/15] non-cuda case --- src/cudamatrix/cu-kernels-ansi.h | 26 +++++++++++ src/cudamatrix/cu-kernels.h | 54 ++++++++++++++++------- src/cudamatrix/cu-matrix.cc | 45 ++++++++++++++++--- src/cudamatrix/cu-matrix.h | 2 +- src/nnet3/nnet-convolutional-component.cc | 2 +- src/nnet3/nnet-convolutional-component.h | 13 +++--- 6 files changed, 111 insertions(+), 31 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 6b99a77e73b..5520cb1c1db 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -110,6 +110,32 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim); void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim); +void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + double index_max_, int A_trans); +void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + float index_max_, int A_trans); +void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, + MatrixDim src_dim, double *dst, MatrixDim dst_dim); +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, + MatrixDim src_dim, float *dst, MatrixDim dst_dim); void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 27a8d8a34fe..2804a624959 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -169,28 +169,50 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); } inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, - const double *src, int32_cuda num_row_blocks, - int32_cuda num_col_blocks, double *dst, - MatrixDim d, int src_stride, int A_trans) { - cudaD_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, - d, src_stride, A_trans); -} -inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, - int32_cuda num_row_blocks, - int32_cuda num_col_blocks, float *dst, - MatrixDim d, int src_stride, int A_trans) { - cudaF_max_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, - d, src_stride, A_trans); + const double *src, double *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + double index_max_, int A_trans) { + cudaD_max_mat_blocks(Gr, Bl, src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_, A_trans); } -inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, + const float *src, float *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + float index_max_, int A_trans) { + cudaF_max_mat_blocks(Gr, Bl, src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_, A_trans); +} +inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { - cudaD_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); + cudaD_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim); } -inline void cuda_max_mat_repeated(dim3 Gr, dim3 Bl, +inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim) { - cudaF_max_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); + cudaF_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index dcd37b76709..ae88db54df9 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1190,7 +1190,7 @@ void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, template void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, - vector index_max, + CuVectorBase &index_max_, MatrixTransposeType transA, const int32 input_t_dim_, const int32 pool_t_size_, @@ -1248,10 +1248,37 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif - // { - // TO DO + { // maxpooling without cuda - // } + int32 tmp = 0; + for (int32 t = 0; t < num_pools_t; t++) { + for (int32 h = 0; h < num_pools_t; h++) { + for (int32 f = 0; f < num_pools_f; f++) { + // initialize the maximum value as the first element in the pool + int32 max_x = 0; int32 max_y = 0; + int32 max_value = A[t * pool_t_step_][h * pool_h_step_ * input_f_dim_ + f * pool_f_step_]; + + // find the maximm value in the pool + for (int32 x = 0; x < pool_t_size_; x++) { + for (int32 y = 0; y < pool_h_size_; y++) { + for (int32 z = 0; z < pool_f_size_; z++) { + int32 cur_x = t * pool_t_step_ + x; + int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z; + if (A[cur_x][cur_y] > max_value) { + max_x = cur_x + max_y = cur_y; + max_value = A[cur_x][cur_y]; + index_max_[tmp] = cur_x + index_max_[tmp+1] = cur_y + } + } + } + } + tmp += 2 + } + } + } + } } else { // This is the "backward-propagation" version of MaxMatBlocks, where @@ -1280,10 +1307,14 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif - // { - // TO DO + { // maxpooling backward propagation without cuda - // } + *this.SetZero() + for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) { + *this[index_max_[x]][index_max_[x+1]] = 1 + } + *this.MulElements(A); + } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index b12819eac39..67e2c9ad6ca 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -527,7 +527,7 @@ class CuMatrixBase { /// function set all the values in &out_deriv whose index is not in /// vector(not corresponding to maximum value in each pool of &in_value) /// as zero, and keeps those correponding to maximum value as the *in_deriv. - void MaxMatBlocks(const CuMatrixBase &A, vector index_max_, + void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, MatrixTransposeType trans = kNoTrans, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_, diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 73de0ea5844..16b6ff0da37 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -1,4 +1,4 @@ -// nnet3/nnet-convolutional-component.cc + // nnet3/nnet-convolutional-component.cc // Copyright 2017 Johns Hopkins University (author: Daniel Povey) diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 4677c894e69..dd73216ff18 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -25,7 +25,6 @@ #include "nnet3/natural-gradient-online.h" #include "nnet3/convolution.h" #include -#include namespace kaldi { namespace nnet3 { @@ -467,18 +466,20 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { size along t-axis) index_max_ a vector that store the index of the maximum - value as (t,h,f), used in back-propagation. + value as (r, c), used in back-propagation. The + size of this vector is 2 * num_pools_t * + num_pools_h * num_pools_f Output : The output is also a 2.5D tensor with dimension (num_block_t by num_block_h * num_block_f) where: - num_block_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; // the number of blocks in t dimension - num_block_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; // the number of blocks in h dimension - num_block_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; // the number of blocks in f dimension @@ -532,7 +533,7 @@ class MaxPoolingOverBlock: public Component { int32 pool_f_step_; // the number of steps taken along f-axis of input // before computing the next pool - vector index_max_; + CuVectorBase index_max_; // the index of maximum value MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. }; From 5ca6aba110fce7203298209375baf54b5fc2d990 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Thu, 3 May 2018 14:35:53 -0400 Subject: [PATCH 07/15] few updates about parameters' name --- src/cudamatrix/cu-kernels-ansi.h | 40 ++++---- src/cudamatrix/cu-kernels.cu | 114 ++++++++++++++-------- src/cudamatrix/cu-kernels.h | 4 +- src/nnet3/nnet-convolutional-component.cc | 10 +- src/nnet3/nnet-convolutional-component.h | 12 +-- 5 files changed, 107 insertions(+), 73 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 5520cb1c1db..a5fd632976f 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -111,27 +111,27 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim); void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - double index_max_, int A_trans); + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_, int A_trans); void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - float index_max_, int A_trans); + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_, int A_trans); void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim); void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 63ac13cd6ef..4699060520e 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -775,30 +775,37 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src, template __global__ -static void _max_mat_blocks(const Real* src, - int32_cuda num_row_blocks, - int32_cuda num_col_blocks, Real* dst, MatrixDim d, - int src_stride) { +static void _max_mat_blocks(const Real* src, Real* dst, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j * d.stride; - int32_cuda index_src = i + j * src_stride; - if (i < d.cols && j < d.rows) - for (int32_cuda p = 0; p < num_row_blocks; p++) { - for (int32_cuda q = 0; q < num_col_blocks; q++) { - dst[index] = fmax( - src[index_src + p * src_stride * d.rows + q * d.cols], - dst[index]); + } } } template __global__ -static void _max_mat_blocks_trans(const Real* src, - int32_cuda num_row_blocks, - int32_cuda num_col_blocks, Real* dst, - MatrixDim d, int src_stride) { +static void _max_mat_blocks_trans(const Real* src, Real* dst, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda index = i + j * d.stride; @@ -815,9 +822,9 @@ static void _max_mat_blocks_trans(const Real* src, template __global__ -static void _max_mat_repeated(const Real* src, - MatrixDim src_dim, Real* dst, - MatrixDim dst_dim) { +static void _max_mat_blocks_back(const int index_max_, const Real* src, + MatrixDim src_dim, Real* dst, + MatrixDim dst_dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda src_i = i % src_dim.cols, @@ -4007,16 +4014,29 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } } -void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float* src, - int32_cuda num_row_blocks, int32_cuda num_col_blocks, - float* dst, MatrixDim d, int src_stride, - int A_trans) { +void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(src, num_row_blocks, num_col_blocks, - dst, d, src_stride); + _max_mat_blocks_trans<<>>(src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_); } else { - _max_mat_blocks<<>>(src, num_row_blocks, num_col_blocks, dst, - d, src_stride); + _max_mat_blocks<<>>(src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_); } } @@ -4025,9 +4045,10 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaF_max_mat_repeated(dim3 Gr, dim3 Bl, const float* src, - MatrixDim src_dim, float *dst, MatrixDim dst_dim) { - _max_mat_repeated<<>>(src, src_dim, dst, dst_dim); +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, int index_max_, + const float *src, MatrixDim src_dim, + float *dst, MatrixDim dst_dim) { + _max_mat_blocks_back<<>>(index_max_, src, src_dim, dst, dst_dim); } void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, @@ -4728,16 +4749,29 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } } -void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double* src, - int32_cuda num_row_blocks, int32_cuda num_col_blocks, - double* dst, MatrixDim d, int src_stride, - int A_trans) { +void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(src, num_row_blocks, num_col_blocks, - dst, d, src_stride); + _max_mat_blocks_trans<<>>(src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_); } else { - _max_mat_blocks<<>>(src, num_row_blocks, num_col_blocks, dst, - d, src_stride); + _max_mat_blocks<<>>(src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_); } } @@ -4746,9 +4780,9 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaD_max_mat_repeated(dim3 Gr, dim3 Bl, const double* src, +void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { - _max_mat_repeated<<>>(src, src_dim, dst, dst_dim); + _max_mat_blocks_back<<>>(index_max_, src, src_dim, dst, dst_dim); } void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 2804a624959..8985051a8c2 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -179,7 +179,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, int32_cuda input_f_dim_, int32_cuda pool_f_size_, int32_cuda pool_f_step_, - double index_max_, int A_trans) { + int index_max_, int A_trans) { cudaD_max_mat_blocks(Gr, Bl, src, dst, input_t_dim_, pool_t_size_, pool_t_step_, input_h_dim_, pool_h_size_, pool_h_step_, @@ -197,7 +197,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, int32_cuda input_f_dim_, int32_cuda pool_f_size_, int32_cuda pool_f_step_, - float index_max_, int A_trans) { + int index_max_, int A_trans) { cudaF_max_mat_blocks(Gr, Bl, src, dst, input_t_dim_, pool_t_size_, pool_t_step_, input_h_dim_, pool_h_size_, pool_h_step_, diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 16b6ff0da37..9822b34a2a5 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -801,10 +801,10 @@ std::string MaxPoolingOverBlock::Info() const { } void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const CuMatrixBase &in_value, + CuMatrixBase *out_value) const { - out->MaxMatBlocks(in, index_max_, kNoTrans, + out_value->MaxMatBlocks(in_value, index_max_, kNoTrans, input_t_dim_, pool_t_size_, pool_t_step_, input_h_dim_, pool_h_size_, pool_h_step_, input_f_dim_, pool_f_size_, pool_f_step_); @@ -814,8 +814,8 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, void MaxPoolingOverBlock::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, //in_value - const CuMatrixBase &out_value, // out_value, + const CuMatrixBase &,//in_value, + const CuMatrixBase &,//out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index dd73216ff18..5e454be9cea 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -495,19 +495,19 @@ class MaxPoolingOverBlock: public Component { return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|kBackpropAdds; } virtual void InitFromConfig(ConfigLine *cfl); - virtual int32 InputDim() const;// { return input_dim_; } - virtual int32 OutputDim() const;// { return output_dim_; } + virtual int32 InputDim() const; + virtual int32 OutputDim() const; virtual void Read(std::istream &is, bool binary); virtual void Write(std::ostream &os, bool binary) const; virtual std::string Info() const; virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); } virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; + const CuMatrixBase &in_value, + CuMatrixBase *out_value) const; virtual void Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, //in_value - const CuMatrixBase &out_value, // out_value, + const CuMatrixBase &,//in_value, + const CuMatrixBase &,//out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, From 7d53314df6925797f8464fa73433e29a56d3fad7 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Thu, 3 May 2018 17:42:07 -0400 Subject: [PATCH 08/15] cuda code for back propagation --- src/cudamatrix/cu-kernels-ansi.h | 26 +++++-- src/cudamatrix/cu-kernels.cu | 118 +++++++++++++++++++++---------- src/cudamatrix/cu-kernels.h | 44 +++++++++--- src/cudamatrix/cu-matrix.cc | 26 ++++--- 4 files changed, 156 insertions(+), 58 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index a5fd632976f..a5fd9e9d5a9 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -132,10 +132,28 @@ void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, const int32_cuda pool_f_size_, const int32_cuda pool_f_step_, int index_max_, int A_trans); -void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, const double *src, - MatrixDim src_dim, double *dst, MatrixDim dst_dim); -void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, const float *src, - MatrixDim src_dim, float *dst, MatrixDim dst_dim); +void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, + const int32_cuda num_pools_t, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda num_pools_h, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda num_pools_f, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_); +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst, + const int32_cuda num_pools_t, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda num_pools_h, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda num_pools_f, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_); void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4699060520e..a9198124b74 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -822,17 +822,38 @@ static void _max_mat_blocks_trans(const Real* src, Real* dst, template __global__ -static void _max_mat_blocks_back(const int index_max_, const Real* src, - MatrixDim src_dim, Real* dst, - MatrixDim dst_dim) { +static void _max_mat_blocks_back(const Real* src, Real* dst, + const int32_cuda num_pools_t, + const int32_cuda pool_t_size_, + const int32_cuda pool_t_step_, + const int32_cuda num_pools_h, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda num_pools_f, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, + int index_max_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda src_i = i % src_dim.cols, - src_j = j % src_dim.rows, - dst_index = i + j * dst_dim.stride, - src_index = src_i + src_j * src_dim.stride; - if (i < dst_dim.cols && j < dst_dim.rows) - dst[dst_index] += src[src_index]; + int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + + for (int32_cuda x = 0; x < pool_t_size_ ; x ++) { + int32_cuda cur_x = i * pool_t_step_ + x; + + for (int32_cuda y = 0; y < pool_h_size_ ; y++) { + for (int32_cuda z = 0; z < pool_f_size_; z++) { + int32_cuda cur_y = (j * pool_h_step_ + y) * input_f_dim_ + k * pool_f_step_ + z; + int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + f + + if (cur_x == index_max_[idx_in_idxmax] && + cur_y == index_max_[idx_in_idxmax + 1] || + dst[cur_x][cur_y] == 1) + dst[cur_x][cur_y] = 1; + else + dst[cur_x][cur_y] = 0; + } + } + } } template @@ -4015,16 +4036,16 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_, int A_trans) { + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_, int A_trans) { if (A_trans) { _max_mat_blocks_trans<<>>(src, dst, input_t_dim_, pool_t_size_, pool_t_step_, @@ -4045,10 +4066,22 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, int index_max_, - const float *src, MatrixDim src_dim, - float *dst, MatrixDim dst_dim) { - _max_mat_blocks_back<<>>(index_max_, src, src_dim, dst, dst_dim); +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_) { + _max_mat_blocks_back<<>>(src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_); } void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, @@ -4750,16 +4783,16 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_, int A_trans) { + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_, int A_trans) { if (A_trans) { _max_mat_blocks_trans<<>>(src, dst, input_t_dim_, pool_t_size_, pool_t_step_, @@ -4780,9 +4813,22 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, - MatrixDim src_dim, double *dst, MatrixDim dst_dim) { - _max_mat_blocks_back<<>>(index_max_, src, src_dim, dst, dst_dim); +void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, + int32_cuda input_t_dim_, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda input_h_dim_, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda input_f_dim_, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_) { + _max_mat_blocks_back<<>>(src, dst, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + index_max_); } void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 8985051a8c2..a2faf2682d3 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -204,15 +204,41 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, input_f_dim_, pool_f_size_, pool_f_step_, index_max_, A_trans); } -inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, double index_max_, - const double *src, MatrixDim src_dim, - double *dst, MatrixDim dst_dim) { - cudaD_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim); -} -inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, float index_max_, - const float *src, MatrixDim src_dim, - float *dst, MatrixDim dst_dim) { - cudaF_max_mat_blocks_back(Gr, Bl, index_max_, src, src_dim, dst, dst_dim); +inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const double *src, double *dst, + int32_cuda num_pools_t, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda num_pools_h, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda num_pools_f, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_) { + cudaD_max_mat_blocks_back(Gr, Bl, src, dst, + num_pools_t, pool_t_size_, pool_t_step_, + num_pools_h, pool_h_size_, pool_h_step_, + num_pools_f, pool_f_size_, pool_f_step_, + index_max_); +} +inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const float *src, float *dst, + int32_cuda num_pools_t, + int32_cuda pool_t_size_, + int32_cuda pool_t_step_, + int32_cuda num_pools_h, + int32_cuda pool_h_size_, + int32_cuda pool_h_step_, + int32_cuda num_pools_f, + int32_cuda pool_f_size_, + int32_cuda pool_f_step_, + int index_max_) { + cudaF_max_mat_blocks_back(Gr, Bl, src, dst, + num_pools_t, pool_t_size_, pool_t_step_, + num_pools_h, pool_h_size_, pool_h_step_, + num_pools_f, pool_f_size_, pool_f_step_, + index_max_); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index ae88db54df9..49958a9fff0 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1234,11 +1234,11 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f); - dim3 dimGrid(1024); + dim3 dimGrid(1); // dim3 dimGrid, dimBlock; // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), // &dimGrid, &dimBlock); - cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, Dim(), + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, input_t_dim_, pool_t_size_, pool_t_step_, input_h_dim_, pool_h_size_, pool_h_step_, input_f_dim_, pool_f_size_, pool_f_step_, @@ -1260,9 +1260,10 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // find the maximm value in the pool for (int32 x = 0; x < pool_t_size_; x++) { + int32 cur_x = t * pool_t_step_ + x; + for (int32 y = 0; y < pool_h_size_; y++) { - for (int32 z = 0; z < pool_f_size_; z++) { - int32 cur_x = t * pool_t_step_ + x; + for (int32 z = 0; z < pool_f_size_; z++) { int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z; if (A[cur_x][cur_y] > max_value) { max_x = cur_x @@ -1298,11 +1299,18 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_, - A.data_, A.Dim(), data_, Dim()); + dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f); + dim3 dimGrid(1); + // dim3 dimGrid, dimBlock; + // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + // &dimGrid, &dimBlock); + cuda_max_mat_blocks_back(A.data_, data_, Dim(), + num_pools_t, pool_t_size_, pool_t_step_, + num_pools_h, pool_h_size_, pool_h_step_, + num_pools_f, pool_f_size_, pool_f_step_, + index_max_,) + // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_, + // A.data_, A.Dim(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else From a91c345fb9f9e3eed0205ba9e3913371f306befe Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Fri, 4 May 2018 16:06:19 -0400 Subject: [PATCH 09/15] cuda programming --- src/cudamatrix/cu-kernels-ansi.h | 72 ++++--- src/cudamatrix/cu-kernels.cu | 317 ++++++++++++++++++------------- src/cudamatrix/cu-kernels.h | 128 ++++++------- src/cudamatrix/cu-matrix.cc | 20 +- 4 files changed, 285 insertions(+), 252 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index a5fd9e9d5a9..386b43ae16d 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -110,50 +110,48 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, MatrixDim src_dim, double *dst, MatrixDim dst_dim); void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim); -void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, - const int32_cuda input_t_dim_, - const int32_cuda pool_t_size_, - const int32_cuda pool_t_step_, - const int32_cuda input_h_dim_, +void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, + const double *src, double *dst, double *index_max_, + const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, - const int32_cuda pool_h_step_, - const int32_cuda input_f_dim_, const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_, int A_trans); -void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, - const int32_cuda input_t_dim_, - const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_, + int A_tran); +void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, + const float *src, float *dst, float *index_max_, + const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, const int32_cuda input_f_dim_, - const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_, int A_trans); -void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, - const int32_cuda num_pools_t, - const int32_cuda pool_t_size_, - const int32_cuda pool_t_step_, - const int32_cuda num_pools_h, - const int32_cuda pool_h_size_, - const int32_cuda pool_h_step_, - const int32_cuda num_pools_f, - const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_); -void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst, - const int32_cuda num_pools_t, - const int32_cuda pool_t_size_, - const int32_cuda pool_t_step_, - const int32_cuda num_pools_h, - const int32_cuda pool_h_size_, - const int32_cuda pool_h_step_, - const int32_cuda num_pools_f, - const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_); + int A_tran); +void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const double *src, double *dst, double *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_); +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const float *src, float *dst, float *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_); void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index a9198124b74..f043f64b64b 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -775,82 +775,135 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src, template __global__ -static void _max_mat_blocks(const Real* src, Real* dst, - const int32_cuda input_t_dim_, - const int32_cuda pool_t_size_, - const int32_cuda pool_t_step_, - const int32_cuda input_h_dim_, +static void _max_mat_blocks(const Real* src, Real* dst, Real* index_max_, + const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, - const int32_cuda pool_h_step_, - const int32_cuda input_f_dim_, const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_) { + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - - } + int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + + // initialize the temporary maximum value and its index in each pool + int32_cuda max_row = i * pool_t_step_; + int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_; + int32_cuda max_value = src[max_row][max_col]; + + // loop over all the elements in each pool to find the maximum one, + // and record its index. + + for (int32_cuda t = 0; t < pool_t_size_; t++) { + // the index of row in *src + idx_row = i * pool_t_step_ + t; + + for (int32_cuda h = 0; h < pool_h_size_; h++) { + for (int32_cuda f = 0; f < pool_f_size_; f++) { + // the index of column in *src + idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; + + if (src[idx_row][idx_col] > max_value) { + max_row = idx_row; + max_col = idx_col; + max_value = src[idx_row][idx_col] + } + } } + } + + dst[i][j * num_pools_f + k] = max_value; + + // the index of indexes stored in vector 'index_max_'. + int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k; + index_max_[idx_in_idxmax] = max_row; + index_max_[idx_in_idxmax + 1] = max_col; } +// this function is basicall the same as _max_mat_blocks, except it +// deal with the transpose matrix of *src. So the column and row index +// are exchanged. template __global__ -static void _max_mat_blocks_trans(const Real* src, Real* dst, - const int32_cuda input_t_dim_, - const int32_cuda pool_t_size_, - const int32_cuda pool_t_step_, - const int32_cuda input_h_dim_, +static void _max_mat_blocks_trans(const Real* src, Real* dst, Real* index_max_, + const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, - const int32_cuda pool_h_step_, - const int32_cuda input_f_dim_, const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_) { + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j * d.stride; - int32_cuda index_src = j + i * src_stride; - if (i < d.cols && j < d.rows) - for (int32_cuda p = 0; p < num_row_blocks; p++) { - for (int32_cuda q = 0; q < num_col_blocks; q++) { - dst[index] = fmax( - src[index_src + p * src_stride * d.cols + q * d.rows], - dst[index]); - } + int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + + int32_cuda max_row = i * pool_t_step_; + int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_; + int32_cuda max_value = src[max_col][max_row]; + + for (int32_cuda t = 0; t < pool_t_size_; t++) { + idx_row = i * pool_t_step_ + t; + + for (int32_cuda h = 0; h < pool_h_size_; h++) { + for (int32_cuda f = 0; f < pool_f_size_; f++) { + idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; + + if (src[idx_col][idx_row] > max_value) { + max_row = idx_row; + max_col = idx_col; + max_value = src[idx_col][idx_row] + } + } } + } + + dst[j * num_pools_f + k][i] = max_value; + + int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k; + index_max_[idx_in_idxmax] = max_row; + index_max_[idx_in_idxmax + 1] = max_col; + } template __global__ -static void _max_mat_blocks_back(const Real* src, Real* dst, - const int32_cuda num_pools_t, - const int32_cuda pool_t_size_, - const int32_cuda pool_t_step_, - const int32_cuda num_pools_h, - const int32_cuda pool_h_size_, - const int32_cuda pool_h_step_, - const int32_cuda num_pools_f, - const int32_cuda pool_f_size_, - const int32_cuda pool_f_step_, - int index_max_) { +static void _max_mat_blocks_back(const Real* src, Real* dst, Real* index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; - for (int32_cuda x = 0; x < pool_t_size_ ; x ++) { - int32_cuda cur_x = i * pool_t_step_ + x; + for (int32_cuda t = 0; t < pool_t_size_ ; t++) { + int32_cuda idx_row = i * pool_t_step_ + t; - for (int32_cuda y = 0; y < pool_h_size_ ; y++) { - for (int32_cuda z = 0; z < pool_f_size_; z++) { - int32_cuda cur_y = (j * pool_h_step_ + y) * input_f_dim_ + k * pool_f_step_ + z; - int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + f + for (int32_cuda h = 0; h < pool_h_size_ ; h++) { + for (int32_cuda f = 0; f < pool_f_size_; f++) { + int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; + int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k; - if (cur_x == index_max_[idx_in_idxmax] && - cur_y == index_max_[idx_in_idxmax + 1] || - dst[cur_x][cur_y] == 1) - dst[cur_x][cur_y] = 1; - else - dst[cur_x][cur_y] = 0; + if (idx_row == index_max_[idx_in_idxmax] && + idx_col == index_max_[idx_in_idxmax + 1] || + dst[idx_row][idx_col] != 0) { + dst[idx_row][idx_col] = src[i][j * num_pools_f + k]; + } else { + dst[idx_row][idx_col] = 0; + } } } } @@ -4035,53 +4088,49 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } } -void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_, int A_trans) { - if (A_trans) { - _max_mat_blocks_trans<<>>(src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_); - } else { - _max_mat_blocks<<>>(src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_); - } -} - void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, MatrixDim src_dim, float *dst, MatrixDim dst_dim) { _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_) { - _max_mat_blocks_back<<>>(src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_); +void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, + const float *src, float *dst, float *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_, + int A_trans) { + if (A_trans) { + _max_mat_blocks_trans<<>>(src, dst, index_max_ + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); + } else { + _max_mat_blocks<<>>(src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); + } +} + +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const float *src, float *dst, float *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { + _max_mat_blocks_back<<>>(src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); } void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, @@ -4782,53 +4831,49 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } } -void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_, int A_trans) { - if (A_trans) { - _max_mat_blocks_trans<<>>(src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_); - } else { - _max_mat_blocks<<>>(src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_); - } -} - void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, MatrixDim src_dim, double *dst, MatrixDim dst_dim) { _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); } -void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_) { - _max_mat_blocks_back<<>>(src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_); +void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, + const double *src, double *dst, double *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_, + int A_trans) { + if (A_trans) { + _max_mat_blocks_trans<<>>(src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); + } else { + _max_mat_blocks<<>>(src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); + } +} + +void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const double *src, double *dst, double *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { + _max_mat_blocks_back<<>>(src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); } void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a2faf2682d3..0d8361cb38d 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -168,77 +168,67 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, float *dst, MatrixDim dst_dim) { cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); } -inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, - const double *src, double *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_, int A_trans) { - cudaD_max_mat_blocks(Gr, Bl, src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_, A_trans); -} -inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, - const float *src, float *dst, - int32_cuda input_t_dim_, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda input_h_dim_, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda input_f_dim_, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_, int A_trans) { - cudaF_max_mat_blocks(Gr, Bl, src, dst, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_, A_trans); +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, + const double *src, double *dst, double *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_, + int A_tran) { + cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_, A_trans); +} +inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, + const float *src, float *dst, float *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_, + int A_tran) { + cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_, A_trans); +} +inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const double *src, double *dst, double *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { + cudaD_max_mat_blocks_back(Gr, Bl, src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); } inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, - const double *src, double *dst, - int32_cuda num_pools_t, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda num_pools_h, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda num_pools_f, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_) { - cudaD_max_mat_blocks_back(Gr, Bl, src, dst, - num_pools_t, pool_t_size_, pool_t_step_, - num_pools_h, pool_h_size_, pool_h_step_, - num_pools_f, pool_f_size_, pool_f_step_, - index_max_); -} -inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, - const float *src, float *dst, - int32_cuda num_pools_t, - int32_cuda pool_t_size_, - int32_cuda pool_t_step_, - int32_cuda num_pools_h, - int32_cuda pool_h_size_, - int32_cuda pool_h_step_, - int32_cuda num_pools_f, - int32_cuda pool_f_size_, - int32_cuda pool_f_step_, - int index_max_) { - cudaF_max_mat_blocks_back(Gr, Bl, src, dst, - num_pools_t, pool_t_size_, pool_t_step_, - num_pools_h, pool_h_size_, pool_h_step_, - num_pools_f, pool_f_size_, pool_f_step_, - index_max_); + const float *src, float *dst, float *index_max_, + const int32_cuda pool_t_size_, + const int32_cuda pool_h_size_, + const int32_cuda pool_f_size_, + const int32_cuda pool_t_step_, + const int32_cuda pool_h_step_, + const int32_cuda pool_f_step_, + const int32_cuda input_h_dim_, + const int32_cuda input_f_dim_) { + cudaF_max_mat_blocks_back(Gr, Bl, src, dst, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 49958a9fff0..9af200a8b5e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1238,11 +1238,11 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // dim3 dimGrid, dimBlock; // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), // &dimGrid, &dimBlock); - cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, - input_t_dim_, pool_t_size_, pool_t_step_, - input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_, - index_max_, (transA == kTrans ? 1 : 0)); + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_, + (transA == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); @@ -1275,6 +1275,7 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, } } } + *this[t][h * num_pools_f + f] = max_value; tmp += 2 } } @@ -1304,11 +1305,10 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // dim3 dimGrid, dimBlock; // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), // &dimGrid, &dimBlock); - cuda_max_mat_blocks_back(A.data_, data_, Dim(), - num_pools_t, pool_t_size_, pool_t_step_, - num_pools_h, pool_h_size_, pool_h_step_, - num_pools_f, pool_f_size_, pool_f_step_, - index_max_,) + cuda_max_mat_blocks_back(A.data_, data_, index_max_ + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_) // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_, // A.data_, A.Dim(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); From 0d02c03de9a49fa9d28f275b3d46aa5719ea6d37 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Mon, 7 May 2018 13:55:50 -0400 Subject: [PATCH 10/15] cpp grammar error --- src/cudamatrix/cu-kernels.cu | 6 +++--- src/cudamatrix/cu-kernels.h | 4 ++-- src/cudamatrix/cu-matrix.cc | 22 +++++++++++----------- src/cudamatrix/cu-matrix.h | 6 +++--- src/nnet3/nnet-convolutional-component.cc | 16 +++++++++------- src/nnet3/nnet-convolutional-component.h | 10 ++++++---- 6 files changed, 34 insertions(+), 30 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index f043f64b64b..739a1b48d35 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -775,7 +775,7 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src, template __global__ -static void _max_mat_blocks(const Real* src, Real* dst, Real* index_max_, +static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, const int32_cuda pool_f_size_, @@ -829,7 +829,7 @@ static void _max_mat_blocks(const Real* src, Real* dst, Real* index_max_, // are exchanged. template __global__ -static void _max_mat_blocks_trans(const Real* src, Real* dst, Real* index_max_, +static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_, const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, const int32_cuda pool_f_size_, @@ -874,7 +874,7 @@ static void _max_mat_blocks_trans(const Real* src, Real* dst, Real* index_max_, template __global__ -static void _max_mat_blocks_back(const Real* src, Real* dst, Real* index_max_, +static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_, const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, const int32_cuda pool_f_size_, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 0d8361cb38d..e85360eaf0f 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -178,7 +178,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, const int32_cuda input_f_dim_, - int A_tran) { + int A_trans) { cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_, pool_t_size_, pool_h_size_, pool_f_size_, pool_t_step_, pool_h_step_, pool_f_step_, @@ -194,7 +194,7 @@ inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, const int32_cuda input_f_dim_, - int A_tran) { + int A_trans) { cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_, pool_t_size_, pool_h_size_, pool_f_size_, pool_t_step_, pool_h_step_, pool_f_step_, diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 9af200a8b5e..06e40603dcb 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1191,7 +1191,6 @@ void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, template void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, - MatrixTransposeType transA, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, @@ -1200,7 +1199,8 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, const int32 pool_h_step_, const int32 input_f_dim_, const int32 pool_f_size_, - const int32 pool_f_step_) { + const int32 pool_f_step_, + MatrixTransposeType transA) { if (num_rows_ == 0 || num_cols_ == 0) return; int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; @@ -1266,17 +1266,17 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, for (int32 z = 0; z < pool_f_size_; z++) { int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z; if (A[cur_x][cur_y] > max_value) { - max_x = cur_x + max_x = cur_x; max_y = cur_y; max_value = A[cur_x][cur_y]; - index_max_[tmp] = cur_x - index_max_[tmp+1] = cur_y + index_max_[tmp] = cur_x; + index_max_[tmp+1] = cur_y; } } } } *this[t][h * num_pools_f + f] = max_value; - tmp += 2 + tmp += 2; } } } @@ -1305,10 +1305,10 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // dim3 dimGrid, dimBlock; // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), // &dimGrid, &dimBlock); - cuda_max_mat_blocks_back(A.data_, data_, index_max_ + cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_, pool_t_size_, pool_h_size_, pool_f_size_, pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_) + input_h_dim_, input_f_dim_); // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_, // A.data_, A.Dim(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -1317,11 +1317,11 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, #endif { // maxpooling backward propagation without cuda - *this.SetZero() + this->SetZero(); for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) { - *this[index_max_[x]][index_max_[x+1]] = 1 + *this[index_max_[x]][index_max_[x+1]] = 1; } - *this.MulElements(A); + this->MulElements(A); } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 67e2c9ad6ca..57761457226 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -527,11 +527,11 @@ class CuMatrixBase { /// function set all the values in &out_deriv whose index is not in /// vector(not corresponding to maximum value in each pool of &in_value) /// as zero, and keeps those correponding to maximum value as the *in_deriv. - void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, - MatrixTransposeType trans = kNoTrans, + void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_, - const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_); + const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_, + MatrixTransposeType trans = kNoTrans); /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 9822b34a2a5..7bcca4bd3dc 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -679,12 +679,12 @@ MaxPoolingOverBlock::MaxPoolingOverBlock( pool_f_step_(other.pool_f_step_) { } // aquire input dim -int32 MaxpoolingComponent::InputDim() const { +int32 MaxPoolingOverBlock::InputDim() const { return input_t_dim_ * input_h_dim_ * input_f_dim_; } // aquire output dim -int32 MaxpoolingComponent::OutputDim() const { +int32 MaxPoolingOverBlock::OutputDim() const { int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; @@ -692,7 +692,7 @@ int32 MaxpoolingComponent::OutputDim() const { } // check the component parameters -void MaxpoolingComponent::Check() const { +void MaxPoolingOverBlock::Check() const { // sanity check of the max pooling parameters KALDI_ASSERT(input_t_dim_ > 0); KALDI_ASSERT(input_h_dim_ > 0); @@ -804,10 +804,11 @@ void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, CuMatrixBase *out_value) const { - out_value->MaxMatBlocks(in_value, index_max_, kNoTrans, + out_value->MaxMatBlocks(in_value, index_max_, input_t_dim_, pool_t_size_, pool_t_step_, input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_); + input_f_dim_, pool_f_size_, pool_f_step_, + kNoTrans); return NULL; } @@ -822,10 +823,11 @@ void MaxPoolingOverBlock::Backprop( CuMatrixBase *in_deriv) const { if (in_deriv) { - in_derv->MaxMatBlocks(out_deriv, index_max_, kNoTrans, + in_deriv->MaxMatBlocks(out_deriv, index_max_, input_t_dim_, pool_t_size_, pool_t_step_, input_h_dim_, pool_h_size_, pool_h_step_, - input_f_dim_, pool_f_size_, pool_f_step_); + input_f_dim_, pool_f_size_, pool_f_step_, + kNoTrans); } } diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 5e454be9cea..6544254d8b1 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -513,8 +513,9 @@ class MaxPoolingOverBlock: public Component { Component *to_update, CuMatrixBase *in_deriv) const; virtual void Check() const; - - private: + CuVectorBase index_max_; // the index of maximum value + + protected: int32 input_t_dim_; // size of the input along t-axis // (e.g. number of time steps) int32 input_h_dim_; // size of input along h-axis @@ -533,9 +534,10 @@ class MaxPoolingOverBlock: public Component { int32 pool_f_step_; // the number of steps taken along f-axis of input // before computing the next pool - CuVectorBase index_max_; // the index of maximum value + // CuVectorBase index_max_; // the index of maximum value + // std::vector index_max_; - MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. + // MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. }; } // namespace nnet3 From 8d554310373361b4f8cebdc6ccb67300de8bcf6d Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Mon, 7 May 2018 15:01:25 -0400 Subject: [PATCH 11/15] change [] to () as index of CuVectorBase & CuMatrixBase --- src/cudamatrix/cu-matrix.cc | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 06e40603dcb..771cb4ab6c4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1251,12 +1251,14 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, { // maxpooling without cuda int32 tmp = 0; + // Real *data = this->data_; + // std::vector idx_tmp(index_max_.dim_); for (int32 t = 0; t < num_pools_t; t++) { for (int32 h = 0; h < num_pools_t; h++) { for (int32 f = 0; f < num_pools_f; f++) { // initialize the maximum value as the first element in the pool int32 max_x = 0; int32 max_y = 0; - int32 max_value = A[t * pool_t_step_][h * pool_h_step_ * input_f_dim_ + f * pool_f_step_]; + int32 max_value = A(t * pool_t_step_, h * pool_h_step_ * input_f_dim_ + f * pool_f_step_); // find the maximm value in the pool for (int32 x = 0; x < pool_t_size_; x++) { @@ -1265,21 +1267,25 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, for (int32 y = 0; y < pool_h_size_; y++) { for (int32 z = 0; z < pool_f_size_; z++) { int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z; - if (A[cur_x][cur_y] > max_value) { + if (A(cur_x, cur_y) > max_value) { max_x = cur_x; max_y = cur_y; - max_value = A[cur_x][cur_y]; - index_max_[tmp] = cur_x; - index_max_[tmp+1] = cur_y; + max_value = A(cur_x, cur_y); + index_max_(tmp) = cur_x; + index_max_(tmp+1) = cur_y; } } } } - *this[t][h * num_pools_f + f] = max_value; + (*this)(t, h * num_pools_f + f) = max_value; tmp += 2; } } } + // CuArray cu_cols(idx_tmp); + // // index_max_->CopyCols(index_max_, cu_cols); + // // CuVectorBase ttmp(idx_tmp); + // index_max_.CopyFromVec(cu_cols); } } else { @@ -1305,12 +1311,12 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // dim3 dimGrid, dimBlock; // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), // &dimGrid, &dimBlock); - cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); - // cuda_max_mat_blocks_back(dimGrid, dimBlock, index_max_, - // A.data_, A.Dim(), data_, Dim()); + + // cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_, + // pool_t_size_, pool_h_size_, pool_f_size_, + // pool_t_step_, pool_h_step_, pool_f_step_, + // input_h_dim_, input_f_dim_); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -1319,7 +1325,7 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // maxpooling backward propagation without cuda this->SetZero(); for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) { - *this[index_max_[x]][index_max_[x+1]] = 1; + (*this)(index_max_(x),index_max_(x+1)) = 1; } this->MulElements(A); } From d73c86d4e7a8cc63cce91c84be27570ad355517d Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Tue, 8 May 2018 12:14:30 -0400 Subject: [PATCH 12/15] add some description --- src/cudamatrix/cu-kernels.cu | 18 ++-- src/cudamatrix/cu-matrix.cc | 6 +- src/cudamatrix/cu-matrix.h | 53 +++++++++++ src/nnet3/nnet-convolutional-component.h | 108 +++++++++++++---------- 4 files changed, 125 insertions(+), 60 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 739a1b48d35..194c36d97f1 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -786,7 +786,7 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, const int32_cuda input_f_dim_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z; int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; @@ -800,17 +800,17 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, for (int32_cuda t = 0; t < pool_t_size_; t++) { // the index of row in *src - idx_row = i * pool_t_step_ + t; + int32_cuda idx_row = i * pool_t_step_ + t; for (int32_cuda h = 0; h < pool_h_size_; h++) { for (int32_cuda f = 0; f < pool_f_size_; f++) { // the index of column in *src - idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; + int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; if (src[idx_row][idx_col] > max_value) { max_row = idx_row; max_col = idx_col; - max_value = src[idx_row][idx_col] + max_value = src[idx_row][idx_col]; } } } @@ -840,7 +840,7 @@ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_, const int32_cuda input_f_dim_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z; int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; @@ -849,16 +849,16 @@ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_, int32_cuda max_value = src[max_col][max_row]; for (int32_cuda t = 0; t < pool_t_size_; t++) { - idx_row = i * pool_t_step_ + t; + int32_cuda idx_row = i * pool_t_step_ + t; for (int32_cuda h = 0; h < pool_h_size_; h++) { for (int32_cuda f = 0; f < pool_f_size_; f++) { - idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; + int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; if (src[idx_col][idx_row] > max_value) { max_row = idx_row; max_col = idx_col; - max_value = src[idx_col][idx_row] + max_value = src[idx_col][idx_row]; } } } @@ -885,7 +885,7 @@ static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_, const int32_cuda input_f_dim_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda k = blockIdx.z * blcokDim.z + threadIdx.z; + int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z; int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 771cb4ab6c4..97cfce8d7e5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1238,7 +1238,7 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // dim3 dimGrid, dimBlock; // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), // &dimGrid, &dimBlock); - cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_, + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,// index_max_, pool_t_size_, pool_h_size_, pool_f_size_, pool_t_step_, pool_h_step_, pool_f_step_, input_h_dim_, input_f_dim_, @@ -1282,10 +1282,6 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, } } } - // CuArray cu_cols(idx_tmp); - // // index_max_->CopyCols(index_max_, cu_cols); - // // CuVectorBase ttmp(idx_tmp); - // index_max_.CopyFromVec(cu_cols); } } else { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 57761457226..7c8c5bfd8af 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -527,6 +527,59 @@ class CuMatrixBase { /// function set all the values in &out_deriv whose index is not in /// vector(not corresponding to maximum value in each pool of &in_value) /// as zero, and keeps those correponding to maximum value as the *in_deriv. + /// Parameters: + /// + /// size of input matrix: + /// input_t_dim_ size of the input along t-axis + /// (e.g. number of time steps) + /// input_h_dim_ size of input along h-axis + /// (e.g. number of mel-frequency bins) + /// input_f_dim_ size of input along f-axis + /// (e.g. number of filters in the ConvolutionComponent) + /// + /// block size: + /// pool_t_size_ size of the pooling window along t-axis + /// pool_h_size_ size of the pooling window along h-axis + /// pool_f_size_ size of the pooling window along f-axis + /// (So, the dimension of block is: + /// pool_t_size_ by pool_h_size_ * pool_f_size_) + /// + /// stride size: + /// pool_t_step_ the number of steps taken along t-axis of input + /// before computing the next pool (e.g. the stride + /// size along t-axis) + /// pool_h_step_ the number of steps taken along h-axis of input + /// before computing the next pool (e.g. the stride + /// size along t-axis) + /// pool_f_step_ the number of steps taken along f-axis of input + /// before computing the next pool (e.g. the stride + /// size along t-axis) + + /// index_max_ a vector that store the index of the maximum + /// value as (r, c), used in back-propagation. The + /// size of this vector is 2 * num_pools_t * + /// num_pools_h * num_pools_f + /// + /// So there are totally num_pools_t * num_pools_h * num_pools_f blocks, + /// where: + /// num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + /// // the number of blocks in t dimension + /// num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + /// // the number of blocks in h dimension + /// num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + /// // the number of blocks in f dimension + /// If we have index idx_t, idx_h, idx_f in each axis, then we can find + /// the block with: + /// row index: + /// [start_t, start_t + pool_t_size_]; + /// column index: combination of sets: + /// [start_col(0), start_col(0) + pool_f_size_], + /// [start_col(1), start_col(1) + pool_f_size_], + /// ..., + /// [start_col(pool_h_size_), start_col(pool_h_size_) + pool_f_size_] + /// where: + /// start_row = idx_t * pool_t_step_ + /// start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_ void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_, diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 6544254d8b1..ce6bfd5096f 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -374,11 +374,10 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { this component should be compatible with TimeHeightConvolutionComponent MaxPoolingOverBlock : - MaxPoolingOverBlock component was firstly used in ConvNet for selecting an - representative activation in an area. It inspired Maxout nonlinearity. - Each output element of this component is the maximum of a block of - input elements where the block has a 2.5D dimension (pool_t_size_, - pool_h_size_ * pool_f_size_). + MaxPoolingOverBlock component was firstly used in ConvNet. It inspired + Maxout nonlinearity. Each output element of this component is the + maximum of a block of input elements where the block has a + dimension (pool_t_size_, pool_h_size_ * pool_f_size_). Blocks could overlap if the shift value on any axis is smaller than its corresponding pool size (e.g. pool_t_step_ < pool_t_size_). If the shift values are euqal to their pool size, there is no @@ -387,15 +386,66 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { This component is designed to be used after a ConvolutionComponent so that the input matrix is propagated from a 2d-convolutional layer. - This component implements 2.5d-maxpooling which performs + This component implements maxpooling which performs max pooling along the three axes. - Input : A 2.5D matrix with dimensions: + Input : A matrix with dimensions: t: (e.g. time) h: (e.g. height, mel-frequency) f: (e.g. channels like number of filters in the ConvolutionComponent) - - The reason why we call the matrix 2.5D is because we compress the 3D block + + Parameters: + + input_t_dim_ size of the input along t-axis + (e.g. number of time steps) + input_h_dim_ size of input along h-axis + (e.g. number of mel-frequency bins) + input_f_dim_ size of input along f-axis + (e.g. number of filters in the ConvolutionComponent) + + pool_t_size_ size of the pooling window along t-axis + pool_h_size_ size of the pooling window along h-axis + pool_f_size_ size of the pooling window along f-axis + + pool_t_step_ the number of steps taken along t-axis of input + before computing the next pool (e.g. the stride + size along t-axis) + pool_h_step_ the number of steps taken along h-axis of input + before computing the next pool (e.g. the stride + size along t-axis) + pool_f_step_ the number of steps taken along f-axis of input + before computing the next pool (e.g. the stride + size along t-axis) + + index_max_ a vector that store the index of the maximum + value as (r, c), used in back-propagation. The + size of this vector is 2 * num_pools_t * + num_pools_h * num_pools_f + + So there are totally num_pools_t * num_pools_h * num_pools_f blocks, + where: + num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + // the number of blocks in t dimension + num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + // the number of blocks in h dimension + num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + // the number of blocks in f dimension + + If we have index idx_t, idx_h, idx_f in each axis, then we can find + the block with: + row index: + [start_t, start_t + pool_t_size_]; + column index: combination of sets: + [start_col(0), start_col(0) + pool_f_size_], + [start_col(1), start_col(1) + pool_f_size_], + ..., + [start_col(pool_h_size_), start_col(pool_h_size_) + pool_f_size_] + where: + start_row = idx_t * pool_t_step_ + start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_ + + Example: + We store the 3D matrix into a 2D matrix by concatenating each 2D matrix at different channel like: h = 0 h = 1 @@ -442,45 +492,11 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { stride of height(poo_h_step), we arrange each row of output as: (all filters for height 0)(all filters for height 1)... - Parameters: - - input_t_dim_ size of the input along t-axis - (e.g. number of time steps) - input_h_dim_ size of input along h-axis - (e.g. number of mel-frequency bins) - input_f_dim_ size of input along f-axis - (e.g. number of filters in the ConvolutionComponent) - - pool_t_size_ size of the pooling window along t-axis - pool_h_size_ size of the pooling window along h-axis - pool_f_size_ size of the pooling window along f-axis - - pool_t_step_ the number of steps taken along t-axis of input - before computing the next pool (e.g. the stride - size along t-axis) - pool_h_step_ the number of steps taken along h-axis of input - before computing the next pool (e.g. the stride - size along t-axis) - pool_f_step_ the number of steps taken along f-axis of input - before computing the next pool (e.g. the stride - size along t-axis) - - index_max_ a vector that store the index of the maximum - value as (r, c), used in back-propagation. The - size of this vector is 2 * num_pools_t * - num_pools_h * num_pools_f - - - Output : The output is also a 2.5D tensor with dimension (num_block_t by - num_block_h * num_block_f) where: - num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; - // the number of blocks in t dimension - num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; - // the number of blocks in h dimension - num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; - // the number of blocks in f dimension + Output : The output is also a 2D matrix with dimension (num_block_t by + num_block_h * num_block_f), with each element corresponding to + a block. */ From 7b0d46a4f0b69b301d11d8775aa8c3ba250398e8 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Tue, 8 May 2018 12:51:48 -0400 Subject: [PATCH 13/15] delete commet not needed --- src/cudamatrix/cu-kernels.cu | 2 +- src/cudamatrix/cu-matrix.cc | 27 ++++++++---------------- src/nnet3/nnet-convolutional-component.h | 7 +++--- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 194c36d97f1..8005482122b 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -4105,7 +4105,7 @@ void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const int32_cuda input_f_dim_, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(src, dst, index_max_ + _max_mat_blocks_trans<<>>(src, dst, index_max_, pool_t_size_, pool_h_size_, pool_f_size_, pool_t_step_, pool_h_step_, pool_f_step_, input_h_dim_, input_f_dim_); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 97cfce8d7e5..b9ef308bdd8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1207,7 +1207,6 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; - // Not sure whether this needed? KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0 && (input_h_dim_ - pool_h_size_) % pool_h_step_ == 0 && (input_f_dim_ - pool_f_size_) % pool_f_step_ == 0); @@ -1216,7 +1215,6 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) { // This is the "forward-propagation" version of MaxMatBlocks. // It supports both regular and transposed operation. - if (transA == kNoTrans) { KALDI_ASSERT(A.NumRows() == input_t_dim_ && A.NumCols() == input_h_dim_ * input_f_dim_ && @@ -1235,10 +1233,8 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, CuTimer tim; dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f); dim3 dimGrid(1); - // dim3 dimGrid, dimBlock; - // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - // &dimGrid, &dimBlock); - cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,// index_max_, + + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_.data_, pool_t_size_, pool_h_size_, pool_f_size_, pool_t_step_, pool_h_step_, pool_f_step_, input_h_dim_, input_f_dim_, @@ -1251,8 +1247,6 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, { // maxpooling without cuda int32 tmp = 0; - // Real *data = this->data_; - // std::vector idx_tmp(index_max_.dim_); for (int32 t = 0; t < num_pools_t; t++) { for (int32 h = 0; h < num_pools_t; h++) { for (int32 f = 0; f < num_pools_f; f++) { @@ -1271,8 +1265,8 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, max_x = cur_x; max_y = cur_y; max_value = A(cur_x, cur_y); - index_max_(tmp) = cur_x; - index_max_(tmp+1) = cur_y; + index_max_(tmp) = max_x; + index_max_(tmp+1) = max_y; } } } @@ -1304,14 +1298,11 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, CuTimer tim; dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f); dim3 dimGrid(1); - // dim3 dimGrid, dimBlock; - // GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - // &dimGrid, &dimBlock); - - // cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_, - // pool_t_size_, pool_h_size_, pool_f_size_, - // pool_t_step_, pool_h_step_, pool_f_step_, - // input_h_dim_, input_f_dim_); + + cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_.data_, + pool_t_size_, pool_h_size_, pool_f_size_, + pool_t_step_, pool_h_step_, pool_f_step_, + input_h_dim_, input_f_dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index ce6bfd5096f..6cd421eaf5a 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -529,7 +529,7 @@ class MaxPoolingOverBlock: public Component { Component *to_update, CuMatrixBase *in_deriv) const; virtual void Check() const; - CuVectorBase index_max_; // the index of maximum value + protected: int32 input_t_dim_; // size of the input along t-axis @@ -550,10 +550,9 @@ class MaxPoolingOverBlock: public Component { int32 pool_f_step_; // the number of steps taken along f-axis of input // before computing the next pool - // CuVectorBase index_max_; // the index of maximum value - // std::vector index_max_; + CuVectorBase index_max_; // the index of maximum value - // MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. + MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other); // Disallow. }; } // namespace nnet3 From 2e99da3371c35a8df006659c1d0ff2789cf812e1 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Tue, 8 May 2018 17:52:06 -0400 Subject: [PATCH 14/15] add a within-block stride --- src/cudamatrix/cu-kernels-ansi.h | 50 +++++----- src/cudamatrix/cu-kernels.cu | 165 +++++++++++++++++-------------- src/cudamatrix/cu-kernels.h | 78 ++++++++------- src/cudamatrix/cu-matrix.cc | 15 +-- src/cudamatrix/cu-matrix.h | 5 +- 5 files changed, 171 insertions(+), 142 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 386b43ae16d..7ee981f6544 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -112,46 +112,52 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, MatrixDim src_dim, float *dst, MatrixDim dst_dim); void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, double *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, int A_tran); void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, float *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, int A_tran); void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, double *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_); -void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, - const float *src, float *dst, float *index_max_, - const int32_cuda pool_t_size_, const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_); +void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, + const float *src, float *dst, float *index_max_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_); + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_); void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8005482122b..34f5c7652ba 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -775,15 +775,17 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src, template __global__ -static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, +static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z; @@ -793,12 +795,12 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, // initialize the temporary maximum value and its index in each pool int32_cuda max_row = i * pool_t_step_; int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_; - int32_cuda max_value = src[max_row][max_col]; + int32_cuda max_value = src[max_row * input_h_dim_ * input_f_dim_ + max_col]; // loop over all the elements in each pool to find the maximum one, // and record its index. - for (int32_cuda t = 0; t < pool_t_size_; t++) { + for (int32_cuda t = 0; t < pool_t_size_; t += stride) { // the index of row in *src int32_cuda idx_row = i * pool_t_step_ + t; @@ -807,16 +809,16 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, // the index of column in *src int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; - if (src[idx_row][idx_col] > max_value) { + if (src[idx_row * input_h_dim_ * input_f_dim_ + idx_col] > max_value) { max_row = idx_row; max_col = idx_col; - max_value = src[idx_row][idx_col]; + max_value = src[max_row * input_h_dim_ * input_f_dim_ + max_col]; } } } } - dst[i][j * num_pools_f + k] = max_value; + dst[i * num_pools_h * num_pools_f + j * num_pools_f + k] = max_value; // the index of indexes stored in vector 'index_max_'. int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k; @@ -830,41 +832,44 @@ static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_, template __global__ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z; + int32_cuda num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; int32_cuda max_row = i * pool_t_step_; int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_; - int32_cuda max_value = src[max_col][max_row]; + int32_cuda max_value = src[max_col * input_t_dim_ + max_row]; - for (int32_cuda t = 0; t < pool_t_size_; t++) { + for (int32_cuda t = 0; t < pool_t_size_; t += stride) { int32_cuda idx_row = i * pool_t_step_ + t; for (int32_cuda h = 0; h < pool_h_size_; h++) { for (int32_cuda f = 0; f < pool_f_size_; f++) { int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_ + k * pool_f_step_ + f; - if (src[idx_col][idx_row] > max_value) { + if (src[idx_col * input_t_dim_ + idx_row] > max_value) { max_row = idx_row; max_col = idx_col; - max_value = src[idx_col][idx_row]; + max_value = src[max_col * input_t_dim_ + max_row]; } } } } - dst[j * num_pools_f + k][i] = max_value; + dst[(j * num_pools_f + k) * num_pools_t + i] = max_value; int32_cuda idx_in_idxmax = (i * num_pools_h + j) * num_pools_f + k; index_max_[idx_in_idxmax] = max_row; @@ -875,14 +880,15 @@ static void _max_mat_blocks_trans(const Real *src, Real *dst, Real *index_max_, template __global__ static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z; @@ -899,10 +905,11 @@ static void _max_mat_blocks_back(const Real *src, Real *dst, Real *index_max_, if (idx_row == index_max_[idx_in_idxmax] && idx_col == index_max_[idx_in_idxmax + 1] || - dst[idx_row][idx_col] != 0) { - dst[idx_row][idx_col] = src[i][j * num_pools_f + k]; + dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] != 0) { + dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] = + src[i * num_pools_h * num_pools_f + j * num_pools_f + k]; } else { - dst[idx_row][idx_col] = 0; + dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] = 0; } } } @@ -4095,42 +4102,45 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, float *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + _max_mat_blocks_trans<<>>(src, dst, index_max_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } else { - _max_mat_blocks<<>>(src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + _max_mat_blocks<<>>(src, dst, index_max_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } } void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst, float *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { _max_mat_blocks_back<<>>(src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, @@ -4838,42 +4848,45 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, double *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, int A_trans) { if (A_trans) { - _max_mat_blocks_trans<<>>(src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + _max_mat_blocks_trans<<>>(src, dst, index_max_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } else { - _max_mat_blocks<<>>(src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + _max_mat_blocks<<>>(src, dst, index_max_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } } void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, double *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { _max_mat_blocks_back<<>>(src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index e85360eaf0f..0712179bca8 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -170,65 +170,71 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, } inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const double *src, double *dst, double *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, int A_trans) { - cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_, A_trans); + cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, A_trans); } inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl, const float *src, float *dst, float *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda stride, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_, int A_trans) { - cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_, A_trans); + cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, A_trans); } inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, const double *src, double *dst, double *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { cudaD_max_mat_blocks_back(Gr, Bl, src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl, const float *src, float *dst, float *index_max_, - const int32_cuda pool_t_size_, - const int32_cuda pool_h_size_, - const int32_cuda pool_f_size_, + const int32_cuda input_t_dim_, + const int32_cuda pool_t_size_, const int32_cuda pool_t_step_, - const int32_cuda pool_h_step_, - const int32_cuda pool_f_step_, const int32_cuda input_h_dim_, - const int32_cuda input_f_dim_) { + const int32_cuda pool_h_size_, + const int32_cuda pool_h_step_, + const int32_cuda input_f_dim_, + const int32_cuda pool_f_size_, + const int32_cuda pool_f_step_) { cudaF_max_mat_blocks_back(Gr, Bl, src, dst, index_max_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index b9ef308bdd8..969a4629391 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1191,6 +1191,7 @@ void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, template void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, + const int32 stride, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, @@ -1234,10 +1235,10 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f); dim3 dimGrid(1); - cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_.data_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_, + cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_, index_max_.data_, stride, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, (transA == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); @@ -1300,9 +1301,9 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, dim3 dimGrid(1); cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_, index_max_.data_, - pool_t_size_, pool_h_size_, pool_f_size_, - pool_t_step_, pool_h_step_, pool_f_step_, - input_h_dim_, input_f_dim_); + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 7c8c5bfd8af..f64838b3871 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -554,6 +554,9 @@ class CuMatrixBase { /// pool_f_step_ the number of steps taken along f-axis of input /// before computing the next pool (e.g. the stride /// size along t-axis) + /// stride the time stride size within blocks. So we get + /// one row of maxpooling candidate every stride rows + /// in the input matrix. /// index_max_ a vector that store the index of the maximum /// value as (r, c), used in back-propagation. The @@ -580,7 +583,7 @@ class CuMatrixBase { /// where: /// start_row = idx_t * pool_t_step_ /// start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_ - void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, + void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, const int32 stride, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_, const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_, From 89ac55b0997b274eb353554ac861f877ccc76874 Mon Sep 17 00:00:00 2001 From: Qian Kun Date: Thu, 17 May 2018 07:19:38 -0400 Subject: [PATCH 15/15] testcode for MaxMatBlocks function --- src/cudamatrix/cu-matrix-test.cc | 92 ++++++++++++++++++++++++++++++++ src/cudamatrix/cu-matrix.cc | 5 +- src/cudamatrix/cu-matrix.h | 4 +- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 01030bb8353..60d87ca1ca7 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2917,6 +2917,97 @@ static void UnitTestCuMatrixEqualElementMask() { } +template +static void UnitTestCuMatrixMaxMatBlocks() { + for (int32 l = 0; l < 5; l++) { + int32 stride = RandInt(1, 5); + int32 input_t_dim_ = RandInt(1, 100); + int32 pool_t_size_ = RandInt(1, 10); + int32 pool_t_step_ = RandInt(1, 10); + int32 input_h_dim_ = RandInt(1, 100); + int32 pool_h_size_ = RandInt(1, 10); + int32 pool_h_step_ = RandInt(1, 10); + int32 input_f_dim_ = RandInt(1, 100); + int32 pool_f_size_ = RandInt(1, 10); + int32 pool_f_step_ = RandInt(1, 10); + + + // this part is for testing of forward propagation + CuMatrix in_value(input_t_dim_, input_h_dim_ * input_f_dim_); + in_value.SetRandn(); + + int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_; + int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_; + int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_; + + CuMatrix out_value(num_pools_t, num_pools_h * num_pools_f); + out_value.SetRandn(); + + CuVector index_max_(2 * num_pools_t * num_pools_h * num_pools_f); + index_max_.SetRandn(); + + CuMatrix out_value_copy(out_value); + + out_value.MaxMatBlocks(in_value, index_max_, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + kNoTrans); + int32 tmp = 0; + for (int32 t = 0; t < num_pools_t; t++) { + for (int32 h = 0; h < num_pools_t; h++) { + for (int32 f = 0; f < num_pools_f; f++) { + // initialize the maximum value as the first element in the pool + int32 max_x = 0; int32 max_y = 0; + int32 max_value = in_value(t * pool_t_step_, h * pool_h_step_ * input_f_dim_ + f * pool_f_step_); + + // find the maximm value in the pool + for (int32 x = 0; x < pool_t_size_; x += stride) { + int32 cur_x = t * pool_t_step_ + x; + + for (int32 y = 0; y < pool_h_size_; y++) { + for (int32 z = 0; z < pool_f_size_; z++) { + int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_ + f * pool_f_step_ + z; + if (in_value(cur_x, cur_y) > max_value) { + max_x = cur_x; + max_y = cur_y; + max_value = in_value(cur_x, cur_y); + index_max_(tmp) = max_x; + index_max_(tmp+1) = max_y; + } + } + } + } + out_value_copy(t, h * num_pools_f + f) = max_value; + tmp += 2; + } + } + } + + AssertEqual(out_value, out_value_copy); + + // this part is for testing backward propagation + CuMatrix in_deriv(input_t_dim_, input_h_dim_ * input_f_dim_); + in_deriv.SetZero(); + CuMatrix out_deriv(num_pools_t, num_pools_h * num_pools_f); + out_deriv.SetRandn(); + CuMatrix in_deriv_copy(in_deriv); + + in_deriv.MaxMatBlocks(out_deriv, index_max_, + input_t_dim_, pool_t_size_, pool_t_step_, + input_h_dim_, pool_h_size_, pool_h_step_, + input_f_dim_, pool_f_size_, pool_f_step_, + kNoTrans); + + for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) { + int32 row_tmp = (x / 2) / (num_pools_h * num_pools_f); + int32 col_tmp = (x / 2) % (num_pools_h * num_pools_f); + in_deriv_copy(index_max_(x),index_max_(x+1)) = out_deriv(row_tmp, col_tmp); + } + AssertEqual(in_deriv, in_deriv_copy); + } +} + template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyExpSpecial(); UnitTestCuMatrixApplyExpLimited(); @@ -2987,6 +3078,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixAddToElements(); UnitTestCuMatrixLookup(); UnitTestCuMatrixEqualElementMask(); + UnitTestCuMatrixMaxMatBlocks(); // test CuVector methods UnitTestCuVectorAddVec(); UnitTestCuVectorAddRowSumMat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 969a4629391..f0c10086433 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1313,9 +1313,10 @@ void CuMatrixBase::MaxMatBlocks(const CuMatrixBase &A, // maxpooling backward propagation without cuda this->SetZero(); for (int32 x = 0; x < num_pools_t * num_pools_h * num_pools_f; x += 2) { - (*this)(index_max_(x),index_max_(x+1)) = 1; + int32 row_tmp = (x / 2) / (num_pools_h * num_pools_f); + int32 col_tmp = (x / 2) % (num_pools_h * num_pools_f); + (*this)(index_max_(x),index_max_(x+1)) = A(row_tmp, col_tmp); } - this->MulElements(A); } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index f64838b3871..345d3fd980b 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -583,12 +583,10 @@ class CuMatrixBase { /// where: /// start_row = idx_t * pool_t_step_ /// start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_ + idx_f * pool_f_step_ - void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, const int32 stride, - const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, + void MaxMatBlocks(const CuMatrixBase &A, CuVectorBase &index_max_, const int32 stride, const int32 input_t_dim_, const int32 pool_t_size_, const int32 pool_t_step_, const int32 input_h_dim_, const int32 pool_h_size_, const int32 pool_h_step_, const int32 input_f_dim_, const int32 pool_f_size_, const int32 pool_f_step_, MatrixTransposeType trans = kNoTrans); - /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); /// (for each row r of *this), r = alpha * row + beta * r