Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions egs/hkust/s5/RESULTS
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]

# ConvNet with 2 convolutional layers and 2 ReLU layers
exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
63 changes: 63 additions & 0 deletions egs/hkust/s5/local/nnet2/run_convnet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

# 2015 Xingyu Na
# This script runs on the full training set, using ConvNet setup on top of
# fbank features, on GPU. The ConvNet has four hidden layers, two convolutional
# layers and two affine transform layers with ReLU nonlinearity.
# Convolutional layer [1]:
#   convolution1d, input feature dim is 36, filter dim is 7, output dim is
#   30, 128 filters are used
#   maxpooling, 3-to-1 maxpooling, input dim is 30, output dim is 10
# Convolutional layer [2]:
#   convolution1d, input feature dim is 10, filter dim is 4, output dim is
#   7, 256 filters are used
# Affine transform layers [3-4]:
#   affine transform with ReLU nonlinearity.

temp_dir=
dir=exp/nnet2_convnet
stage=-5
train_original=data/train
train=data-fb/train

. ./cmd.sh
. ./path.sh

. utils/parse_options.sh

parallel_opts="--gpu 1"  # This is suitable for the CLSP network; you'll
                         # likely have to change it.

# Make the FBANK features
if [ $stage -le -5 ]; then
  # Dev set
  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1;
  # BUGFIX: remove the stale MFCC cmvn/feats lists from the *dev* copy.
  # This previously ran "rm $train/{cmvn,feats}.scp", wrongly deleting the
  # training directory's lists (which may not even exist yet at this point).
  rm data-fb/dev/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  # Training set
  utils/copy_data_dir.sh $train_original $train || exit 1;
  rm $train/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    $train $train/log $train/data || exit 1;
  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
fi

(
  # Train the ConvNet (unless a final model already exists), then decode
  # the fbank dev set with the tri5a graph.
  if [ ! -f $dir/final.mdl ]; then
    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
      --cmd "$decode_cmd" --stage $stage \
      --num-threads 1 --minibatch-size 512 \
      --mix-up 20000 --samples-per-iter 300000 \
      --num-epochs 15 --delta-order 2 \
      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
      --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \
      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
      --num-filters2 256 --patch-dim2 4 \
      $train data/lang exp/tri5a_ali $dir || exit 1;
  fi

  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
    --config conf/decode.config \
    exp/tri5a/graph data-fb/dev \
    $dir/decode || exit 1;
)
7 changes: 6 additions & 1 deletion egs/wsj/s5/steps/nnet2/decode.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ fi
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`

case $feat_type in
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
if [ -f $srcdir/delta_order ]; then
delta_order=`cat $srcdir/delta_order 2>/dev/null`
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
;;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
Expand Down
674 changes: 674 additions & 0 deletions egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/cudamatrix/cu-kernels-ansi.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
Expand Down Expand Up @@ -190,6 +191,7 @@ void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool inclu
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
Expand Down
27 changes: 27 additions & 0 deletions src/cudamatrix/cu-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1259,6 +1259,25 @@ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reor
}
}

template<typename Real>
__global__
static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // For each in-range element (i, j) of dst, does dst(i, j) += src(i, reorder[j]).
  // As a special case, a reorder entry of -1 means "skip this column":
  // dst's column j is left unchanged.
  // Note: in this kernel, the x dimension corresponds to rows and the y to
  // columns, as it will be going forward.
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // row index
  int j = blockIdx.y * blockDim.y + threadIdx.y;  // column index
  if (i < dst_dim.rows && j < dst_dim.cols) {
    int index = reorder[j],
        dst_index = i * dst_dim.stride + j;
    if (index >= 0) {
      // Reuse 'index' rather than re-reading reorder[j]: avoids a second,
      // redundant global-memory load per thread.
      dst[dst_index] += src[i * src_stride + index];
    }
  }
}

template<typename Real>
__global__
static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
Expand Down Expand Up @@ -2024,6 +2043,10 @@ void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const Matri
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Float wrapper: launches the _add_cols kernel, which does
// dst(i,j) += src(i, reorder[j]) and skips columns with reorder[j] == -1.
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _add_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Float wrapper: launches the _copy_rows kernel with the given grid/block dims.
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
Expand Down Expand Up @@ -2445,6 +2468,10 @@ void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const Mat
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Double wrapper: launches the _add_cols kernel, which does
// dst(i,j) += src(i, reorder[j]) and skips columns with reorder[j] == -1.
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _add_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Double wrapper: launches the _copy_rows kernel with the given grid/block dims.
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
Expand Down
6 changes: 6 additions & 0 deletions src/cudamatrix/cu-kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val,
// Overload set dispatching on element type, so templated CuMatrix code can
// call cuda_copy_cols / cuda_add_cols / cuda_copy_rows uniformly.
// Float versions forward to the cudaF_* C entry points.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
Expand Down Expand Up @@ -259,6 +262,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val
// Double versions of the type-dispatching overloads; forward to the
// cudaD_* C entry points.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
Expand Down
31 changes: 31 additions & 0 deletions src/cudamatrix/cu-matrix-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,36 @@ static void UnitTestCuMatrixCopyCols() {
}


template<typename Real>
static void UnitTestCuMatrixAddCols() {
for (MatrixIndexT p = 0; p < 2; p++) {
MatrixIndexT num_cols1 = 10 + Rand() % 10,
num_cols2 = 10 + Rand() % 10,
num_rows = 10 + Rand() % 10;
CuMatrix<Real> M(num_rows, num_cols1);
M.SetRandn();

CuMatrix<Real> N(num_rows, num_cols2), O(num_rows, num_cols2);
std::vector<int32> reorder(num_cols2);
for (int32 i = 0; i < num_cols2; i++)
reorder[i] = -1 + (Rand() % (num_cols1 + 1));

if (Rand() % 2 == 0) {
N.AddCols(M, reorder);
} else {
CuArray<int32> cuda_reorder(reorder);
N.AddCols(M, cuda_reorder);
}

for (int32 i = 0; i < num_rows; i++)
for (int32 j = 0; j < num_cols2; j++)
if (reorder[j] < 0) O(i, j) = 0;
else O(i, j) = M(i, reorder[j]);
AssertEqual(N, O);
}
}


template<typename Real>
static void UnitTestCuMatrixApplyFloor() {

Expand Down Expand Up @@ -2093,6 +2123,7 @@ template<typename Real> void CudaMatrixUnitTest() {
UnitTestCuMatrixCopyFromTp<Real>();
UnitTestCuMatrixAddMatTp<Real>();
UnitTestCuMatrixCopyCols<Real>();
UnitTestCuMatrixAddCols<Real>();
UnitTestCuMatrixSumColumnRanges<Real>();
UnitTestCuMatrixCopyRows<Real>();
UnitTestCuMatrixCopyRowsFromVec<Real>();
Expand Down
50 changes: 50 additions & 0 deletions src/cudamatrix/cu-matrix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,56 @@ void CuMatrixBase<Real>::CopyCols(const CuMatrixBase<Real> &src,
}
}

// Adds column reorder[c] of src to column c of *this; a reorder entry of -1
// means column c is left unchanged.  reorder.size() must equal NumCols() and
// src must have the same number of rows as *this.
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
                                 const std::vector<MatrixIndexT> &reorder) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(static_cast<MatrixIndexT>(reorder.size()) == NumCols());
    KALDI_ASSERT(NumRows() == src.NumRows());
#ifdef KALDI_PARANOID
    // Extra (slow) checking: every index must be -1 or a valid src column.
    MatrixIndexT src_cols = src.NumCols();
    for (size_t i = 0; i < reorder.size(); i++)
      KALDI_ASSERT(reorder[i] >= -1 && reorder[i] < src_cols);
#endif
    // Copy the index vector to device memory for the kernel to read.
    CuArray<MatrixIndexT> cuda_reorder(reorder);

    Timer tim;
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    // This kernel, as it is newer has the (x,y) dims as (rows,cols).
    dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
    cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), cuda_reorder.Data(), Dim(), src.Stride());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    // No GPU available (or CUDA not compiled in): use the CPU implementation.
    Mat().AddCols(src.Mat(), reorder);
  }
}

// Overload of AddCols taking the index vector as a device-side CuArray,
// avoiding a host-to-device copy when the indices already live on the GPU.
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
                                 const CuArray<MatrixIndexT> &reorder) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(reorder.Dim() == NumCols());
    KALDI_ASSERT(NumRows() == src.NumRows());
    Timer tim;
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    // This kernel, as it is newer has the (x,y) dims as (rows,cols).
    dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
    cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), reorder.Data(), Dim(), src.Stride());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    // CPU fallback: bring the indices back to the host and delegate.
    std::vector<MatrixIndexT> reorder_cpu;
    reorder.CopyToVec(&reorder_cpu);
    Mat().AddCols(src.Mat(), reorder_cpu);
  }
}

template<typename Real>
void CuMatrixBase<Real>::CopyRows(const CuMatrixBase<Real> &src,
Expand Down
12 changes: 12 additions & 0 deletions src/cudamatrix/cu-matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ class CuMatrixBase {
void CopyCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);


/// Adds column indices[r] of src to column r.
/// As a special case, if indices[i] == -1, column i is left unchanged.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void AddCols(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);

/// Version of AddCols that takes a CuArray argument.
void AddCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);

/// Copies row r from row indices[r] of src.
/// As a special case, if indexes[i] == -1, sets row i to zero
Expand Down
28 changes: 28 additions & 0 deletions src/matrix/kaldi-matrix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2566,6 +2566,34 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
}
}


// CPU implementation: adds column indices[c] of src to column c of *this;
// an index of -1 means column c is left unchanged.  indices.size() must
// equal NumCols() and src must have the same number of rows as *this.
template<typename Real>
void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
                               const std::vector<MatrixIndexT> &indices) {
  KALDI_ASSERT(NumRows() == src.NumRows());
  KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
  // Early return on zero columns: taking &(indices[0]) below would be
  // undefined behavior on an empty vector.
  if (indices.empty()) return;
  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
      this_stride = stride_, src_stride = src.stride_;
  Real *this_data = this->data_;
  const Real *src_data = src.data_;
#ifdef KALDI_PARANOID
  // Extra (slow) checking: every index must be -1 or a valid src column.
  MatrixIndexT src_cols = src.NumCols();
  for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
       iter != indices.end(); ++iter)
    KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
#endif

  // Hoisted out of the row loop: the index array never changes.
  const MatrixIndexT *index_begin = &(indices[0]);
  // For the sake of memory locality we do this row by row, rather
  // than doing it column-wise using cublas_Xcopy
  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
    const MatrixIndexT *index_ptr = index_begin;
    for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
      if (*index_ptr >= 0)  // -1 means "skip this column".
        this_data[c] += src_data[*index_ptr];
    }
  }
}

template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices) {
Expand Down
8 changes: 8 additions & 0 deletions src/matrix/kaldi-matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,14 @@ class MatrixBase {
void CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);

/// Adds column indices[r] of src to column r.
/// As a special case, if indices[i] == -1, column i is left unchanged.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void AddCols(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);

/// Applies floor to all matrix elements
void ApplyFloor(Real floor_val);

Expand Down
Loading