diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index 753dfc632ba..b927cc86823 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -18,7 +18,7 @@ # # # local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp -# System tdnn7n_sp tdnn7m26j_sp +# System tdnn7n_sp tdnn7m26o_sp # WER on train_dev(tg) 12.18 11.74 # WER on train_dev(fg) 11.12 10.69 # WER on eval2000(tg) 14.9 14.6 diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index c936061de26..3a767721c6d 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -64,7 +64,10 @@ DenominatorComputation::DenominatorComputation( nnet_output.NumRows(), kUndefined, kStrideEqualNumCols); exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); - exp_nnet_output_transposed_.ApplyExp(); + // We limit the nnet output to the range [-30,30] before doing the exp; + // this avoids NaNs appearing in the forward-backward computation, which + // is not done in log space. 
+ exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0); } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index f2926ddc2f1..6b99a77e73b 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -200,6 +200,10 @@ void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit); +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit); void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride); void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 50dd3d1d0ca..934a860a055 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -400,6 +400,26 @@ static void _apply_exp(Real* mat, MatrixDim d) { } } +template +__global__ +static void _apply_exp_limited(Real* mat, MatrixDim d, + Real lower_limit, Real upper_limit) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + if (i < d.cols && j < d.rows) { + Real x = mat[index]; + // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that + // nan's will be set to the lower-limit. 
+ if (!(x >= lower_limit)) + x = lower_limit; + else if (x > upper_limit) + x = upper_limit; + mat[index] = exp(x); + } +} + + template __global__ static void _scale_diag_packed(Real* mat, Real value, int dim) { @@ -3734,6 +3754,11 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } @@ -4430,6 +4455,13 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + + + void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fe706815a44..8f719a8c4a1 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -345,6 +345,14 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr, Bl, mat, d); } +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + cudaF_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride) { diff --git a/src/cudamatrix/cu-matrix-test.cc 
b/src/cudamatrix/cu-matrix-test.cc index 33db8b3e625..01030bb8353 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -194,6 +194,30 @@ static void UnitTestCuMatrixApplyExp() { } +template +static void UnitTestCuMatrixApplyExpLimited() { + int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; + Matrix H(M, N); + H.SetRandn(); + + + BaseFloat lower_limit = -0.2, upper_limit = 0.2; + + CuMatrix D(H); + + D.ApplyExpLimited(lower_limit, upper_limit); + + + H.ApplyFloor(lower_limit); + H.ApplyCeiling(upper_limit); + H.ApplyExp(); + + Matrix H2(D); + + AssertEqual(H,H2); +} + + template static void UnitTestCuMatrixSigmoid() { @@ -2895,6 +2919,7 @@ static void UnitTestCuMatrixEqualElementMask() { template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyExpSpecial(); + UnitTestCuMatrixApplyExpLimited(); UnitTextCuMatrixAddSmatMat(); UnitTextCuMatrixAddMatSmat(); UnitTextCuMatrixAddSmat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 34290561cc5..beccd9dc4a5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2498,6 +2498,37 @@ void CuMatrixBase::ApplyExp() { } } +template +void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { + KALDI_ASSERT(upper_limit > lower_limit); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_rows = num_rows_, num_cols = num_cols_; + for (int32 r = 0; r < num_rows; r++) { + Real *row_data = this->RowData(r); + for (int32 c = 0; c < num_cols; c++) { + Real x = row_data[c]; + if (!(x >= lower_limit)) + x = lower_limit; + if (x > upper_limit) + x = upper_limit; + row_data[c] = Exp(x); + } + } + } +} + 
+ template void CuMatrixBase::ApplyExpSpecial() { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 86c50cfc485..03e69b639d3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -399,6 +399,13 @@ class CuMatrixBase { void ApplyCeiling(Real ceiling_val); void ApplyExp(); + + /// This is equivalent to running: + /// ApplyFloor(lower_limit); + /// ApplyCeiling(upper_limit); + /// ApplyExp() + void ApplyExpLimited(Real lower_limit, Real upper_limit); + /// For each element x of the matrix, set it to /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 669e5112793..00a31fa897c 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1414,7 +1414,7 @@ void* DropoutMaskComponent::Propagate( BaseFloat dropout_proportion = dropout_proportion_; KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); - if (dropout_proportion_ == 0) { + if (dropout_proportion == 0) { out->Set(1.0); return NULL; } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f9f286aaed2..4eb078c0fcb 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3730,15 +3730,11 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // for the preconditioner actually exceeds the memory for the // parameters (by "rank"). 
update_period = 10; - BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_minibatch = 0.0; + BaseFloat num_samples_history = 2000.0, alpha = 4.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); - cfl->GetValue("max-change-per-minibatch", &max_change_per_minibatch); - if (max_change_per_minibatch != 0.0) - KALDI_WARN << "max-change-per-minibatch is now ignored, use 'max-change'"; InitLearningRatesFromConfig(cfl); std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 9d438678f5d..3929c253aab 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1446,6 +1446,19 @@ class PermuteComponent: public Component { trainable scale; it's like a linear component with a diagonal matrix. This version (and its child class NaturalGradientPerElementScaleComponent) requires the input for backprop. See also ScaleAndOffsetComponent. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the scales will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized scale parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized scale parameters; + should only be supplied if 'vector' is not supplied. */ class PerElementScaleComponent: public UpdatableComponent { public: @@ -1670,8 +1683,29 @@ class ConstantFunctionComponent: public UpdatableComponent {
+/** + NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but + it uses a natural gradient update for the per-element scales. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the scales will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized scale parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized scale parameters; + should only be supplied if 'vector' is not supplied. + + And the natural-gradient-related configuration values: + rank=8 + update-period=10 + num-samples-history=2000.0 + alpha=4.0 +*/ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: