Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions egs/hkust/s5/RESULTS
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]

# ConvNet with 2 convolutional layers and 2 ReLU layers
exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
63 changes: 63 additions & 0 deletions egs/hkust/s5/local/nnet2/run_convnet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

# 2015 Xingyu Na
# This script runs on the full training set, using ConvNet setup on top of
# fbank features, on GPU. The ConvNet has four hidden layers, two convolutional
# layers and two affine transform layers with ReLU nonlinearity.
# Convolutional layer [1]:
#   convolution1d, input feature dim is 36, filter dim is 7, output dim is
#   30, 128 filters are used
#   maxpooling, 3-to-1 maxpooling, input dim is 30, output dim is 10
# Convolutional layer [2]:
#   convolution1d, input feature dim is 10, filter dim is 4, output dim is
#   7, 256 filters are used
# Affine transform layers [3-4]:
#   affine transform with ReLU nonlinearity.

temp_dir=
dir=exp/nnet2_convnet
stage=-5
train_original=data/train
train=data-fb/train

. ./cmd.sh
. ./path.sh

. utils/parse_options.sh

parallel_opts="--gpu 1"  # This is suitable for the CLSP network; you'll
                         # likely have to change it.

# Make the FBANK features
if [ $stage -le -5 ]; then
  # Dev set
  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1;
  # BUGFIX: remove the stale MFCC cmvn/feats lists from the *dev* copy.
  # This previously ran "rm $train/{cmvn,feats}.scp", wrongly deleting the
  # training directory's lists (which may not even exist yet at this point).
  rm data-fb/dev/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  # Training set
  utils/copy_data_dir.sh $train_original $train || exit 1;
  rm $train/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    $train $train/log $train/data || exit 1;
  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
fi

(
  # Train the ConvNet (unless a final model already exists), then decode
  # the fbank dev set with the tri5a graph.
  if [ ! -f $dir/final.mdl ]; then
    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
      --cmd "$decode_cmd" --stage $stage \
      --num-threads 1 --minibatch-size 512 \
      --mix-up 20000 --samples-per-iter 300000 \
      --num-epochs 15 --delta-order 2 \
      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
      --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \
      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
      --num-filters2 256 --patch-dim2 4 \
      $train data/lang exp/tri5a_ali $dir || exit 1;
  fi

  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
    --config conf/decode.config \
    exp/tri5a/graph data-fb/dev \
    $dir/decode || exit 1;
)
7 changes: 6 additions & 1 deletion egs/wsj/s5/steps/nnet2/decode.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ fi
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`

case $feat_type in
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
if [ -f $srcdir/delta_order ]; then
delta_order=`cat $srcdir/delta_order 2>/dev/null`
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
;;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
Expand Down
674 changes: 674 additions & 0 deletions egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/cudamatrix/cu-kernels-ansi.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
Expand Down Expand Up @@ -190,6 +191,7 @@ void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool inclu
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
Expand Down
27 changes: 27 additions & 0 deletions src/cudamatrix/cu-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1259,6 +1259,25 @@ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reor
}
}

template<typename Real>
__global__
static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // For each in-range element (i, j) of dst, does dst(i, j) += src(i, reorder[j]).
  // As a special case, a reorder entry of -1 means "skip this column":
  // dst's column j is left unchanged.
  // Note: in this kernel, the x dimension corresponds to rows and the y to
  // columns, as it will be going forward.
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // row index
  int j = blockIdx.y * blockDim.y + threadIdx.y;  // column index
  if (i < dst_dim.rows && j < dst_dim.cols) {
    int index = reorder[j],
        dst_index = i * dst_dim.stride + j;
    if (index >= 0) {
      // Reuse 'index' rather than re-reading reorder[j]: avoids a second,
      // redundant global-memory load per thread.
      dst[dst_index] += src[i * src_stride + index];
    }
  }
}

template<typename Real>
__global__
static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
Expand Down Expand Up @@ -2024,6 +2043,10 @@ void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const Matri
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Float wrapper: launches the _add_cols kernel, which does
// dst(i,j) += src(i, reorder[j]) and skips columns with reorder[j] == -1.
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _add_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Float wrapper: launches the _copy_rows kernel with the given grid/block dims.
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
Expand Down Expand Up @@ -2445,6 +2468,10 @@ void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const Mat
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Double wrapper: launches the _add_cols kernel, which does
// dst(i,j) += src(i, reorder[j]) and skips columns with reorder[j] == -1.
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _add_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}

// Double wrapper: launches the _copy_rows kernel with the given grid/block dims.
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
Expand Down
6 changes: 6 additions & 0 deletions src/cudamatrix/cu-kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val,
// Overload set dispatching on element type, so templated CuMatrix code can
// call cuda_copy_cols / cuda_add_cols / cuda_copy_rows uniformly.
// Float versions forward to the cudaF_* C entry points.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
Expand Down Expand Up @@ -259,6 +262,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val
// Double versions of the type-dispatching overloads; forward to the
// cudaD_* C entry points.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
Expand Down
31 changes: 31 additions & 0 deletions src/cudamatrix/cu-matrix-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,36 @@ static void UnitTestCuMatrixCopyCols() {
}


template<typename Real>
static void UnitTestCuMatrixAddCols() {
for (MatrixIndexT p = 0; p < 2; p++) {
MatrixIndexT num_cols1 = 10 + Rand() % 10,
num_cols2 = 10 + Rand() % 10,
num_rows = 10 + Rand() % 10;
CuMatrix<Real> M(num_rows, num_cols1);
M.SetRandn();

CuMatrix<Real> N(num_rows, num_cols2), O(num_rows, num_cols2);
std::vector<int32> reorder(num_cols2);
for (int32 i = 0; i < num_cols2; i++)
reorder[i] = -1 + (Rand() % (num_cols1 + 1));

if (Rand() % 2 == 0) {
N.AddCols(M, reorder);
} else {
CuArray<int32> cuda_reorder(reorder);
N.AddCols(M, cuda_reorder);
}

for (int32 i = 0; i < num_rows; i++)
for (int32 j = 0; j < num_cols2; j++)
if (reorder[j] < 0) O(i, j) = 0;
else O(i, j) = M(i, reorder[j]);
AssertEqual(N, O);
}
}


template<typename Real>
static void UnitTestCuMatrixApplyFloor() {

Expand Down Expand Up @@ -2093,6 +2123,7 @@ template<typename Real> void CudaMatrixUnitTest() {
UnitTestCuMatrixCopyFromTp<Real>();
UnitTestCuMatrixAddMatTp<Real>();
UnitTestCuMatrixCopyCols<Real>();
UnitTestCuMatrixAddCols<Real>();
UnitTestCuMatrixSumColumnRanges<Real>();
UnitTestCuMatrixCopyRows<Real>();
UnitTestCuMatrixCopyRowsFromVec<Real>();
Expand Down
50 changes: 50 additions & 0 deletions src/cudamatrix/cu-matrix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,56 @@ void CuMatrixBase<Real>::CopyCols(const CuMatrixBase<Real> &src,
}
}

// Adds column reorder[c] of src to column c of *this; a reorder entry of -1
// means column c is left unchanged.  reorder.size() must equal NumCols() and
// src must have the same number of rows as *this.
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
                                 const std::vector<MatrixIndexT> &reorder) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(static_cast<MatrixIndexT>(reorder.size()) == NumCols());
    KALDI_ASSERT(NumRows() == src.NumRows());
#ifdef KALDI_PARANOID
    // Extra (slow) checking: every index must be -1 or a valid src column.
    MatrixIndexT src_cols = src.NumCols();
    for (size_t i = 0; i < reorder.size(); i++)
      KALDI_ASSERT(reorder[i] >= -1 && reorder[i] < src_cols);
#endif
    // Copy the index vector to device memory for the kernel to read.
    CuArray<MatrixIndexT> cuda_reorder(reorder);

    Timer tim;
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    // This kernel, as it is newer has the (x,y) dims as (rows,cols).
    dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
    cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), cuda_reorder.Data(), Dim(), src.Stride());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    // No GPU available (or CUDA not compiled in): use the CPU implementation.
    Mat().AddCols(src.Mat(), reorder);
  }
}

// Overload of AddCols taking the index vector as a device-side CuArray,
// avoiding a host-to-device copy when the indices already live on the GPU.
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
                                 const CuArray<MatrixIndexT> &reorder) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(reorder.Dim() == NumCols());
    KALDI_ASSERT(NumRows() == src.NumRows());
    Timer tim;
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    // This kernel, as it is newer has the (x,y) dims as (rows,cols).
    dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
    cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), reorder.Data(), Dim(), src.Stride());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    // CPU fallback: bring the indices back to the host and delegate.
    std::vector<MatrixIndexT> reorder_cpu;
    reorder.CopyToVec(&reorder_cpu);
    Mat().AddCols(src.Mat(), reorder_cpu);
  }
}

template<typename Real>
void CuMatrixBase<Real>::CopyRows(const CuMatrixBase<Real> &src,
Expand Down
12 changes: 12 additions & 0 deletions src/cudamatrix/cu-matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ class CuMatrixBase {
void CopyCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);


/// Adds column indices[r] of src to column r.
/// As a special case, if indices[i] == -1, column i is left unchanged.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void AddCols(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);

/// Version of AddCols that takes a CuArray argument.
void AddCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);

/// Copies row r from row indices[r] of src.
/// As a special case, if indexes[i] == -1, sets row i to zero
Expand Down
28 changes: 28 additions & 0 deletions src/matrix/kaldi-matrix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2566,6 +2566,34 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
}
}


// CPU implementation: adds column indices[c] of src to column c of *this;
// an index of -1 means column c is left unchanged.  indices.size() must
// equal NumCols() and src must have the same number of rows as *this.
template<typename Real>
void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
                               const std::vector<MatrixIndexT> &indices) {
  KALDI_ASSERT(NumRows() == src.NumRows());
  KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
  // Early return on zero columns: taking &(indices[0]) below would be
  // undefined behavior on an empty vector.
  if (indices.empty()) return;
  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
      this_stride = stride_, src_stride = src.stride_;
  Real *this_data = this->data_;
  const Real *src_data = src.data_;
#ifdef KALDI_PARANOID
  // Extra (slow) checking: every index must be -1 or a valid src column.
  MatrixIndexT src_cols = src.NumCols();
  for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
       iter != indices.end(); ++iter)
    KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
#endif

  // Hoisted out of the row loop: the index array never changes.
  const MatrixIndexT *index_begin = &(indices[0]);
  // For the sake of memory locality we do this row by row, rather
  // than doing it column-wise using cublas_Xcopy
  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
    const MatrixIndexT *index_ptr = index_begin;
    for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
      if (*index_ptr >= 0)  // -1 means "skip this column".
        this_data[c] += src_data[*index_ptr];
    }
  }
}

template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices) {
Expand Down
8 changes: 8 additions & 0 deletions src/matrix/kaldi-matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,14 @@ class MatrixBase {
void CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);

/// Adds column indices[r] of src to column r.
/// As a special case, if indices[i] == -1, column i is left unchanged.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void AddCols(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);

/// Applies floor to all matrix elements
void ApplyFloor(Real floor_val);

Expand Down
Loading