diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 25a60d24cfb..a2b5d0c3a5c 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,6 +84,11 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# same as local/chain/tuning/run_tdnn_lstm_1i.sh, except that dropout is adopted +# cleanup + chain TDNN+LSTM model + per-frame dropout +%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 05b68e5e780..bbe0ba3aa12 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -91,6 +91,11 @@ %WER 37.6 | 15122 94495 | 66.1 18.7 15.1 3.7 37.6 63.2 | 0.646 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys %WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# same as local/chain/tuning/run_tdnn_lstm_1i.sh, except that dropout is adopted +# cleanup + chain TDNN+LSTM model, SDM audio + alignments from ihm data + per-frame dropout. +%WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+LSTM model, SDM original + IHM reverberated data, alignments from ihm data. diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 3e3976ac7a8..92636b4c17e 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -26,6 +26,7 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -242,7 +243,7 @@ if [ $stage -le 16 ]; then --egs.chunk-right-context $chunk_right_context \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh index 008060df070..a96230075b6 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -34,6 +34,7 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -254,7 +255,7 @@ if [ $stage -le 16 ]; then --egs.chunk-right-context-final 0 \ --trainer.num-chunk-per-minibatch 64,32 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100644 index 00000000000..74c0f5a6ead --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,344 @@ +#!/bin/bash + +# This (1l.sh) is the same as 1i but with per-frame dropout on LSTM layer +# It is a regular (non-fast) LSTM with per-frame dropout on [i, f, o] gates of the LSTM, +# the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. +# We have tried both 4-epoch and 5-epoch training. 
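+#
+# Example invocation (this is the configuration whose numbers are added to RESULTS_ihm
+# in this commit; treat it as a usage sketch rather than the only supported setup):
+#   local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned
+#
+# The dropout proportion is controlled by the dropout_schedule variable below,
+# '0,0@0.20,0.3@0.50,0'.  As we understand the trainer's schedule format, this is a
+# piecewise-linear function of the fraction of training data processed: 0 up to 20%,
+# rising to 0.3 at 50% and decaying back to 0 by the end of training.  A rough shell
+# sketch of that interpretation (illustrative only, not used by this script):
+#   frac=0.35   # fraction of training data seen so far
+#   echo "$frac" | awk '{f=$1; if (f<=0.2) p=0; else if (f<=0.5) p=0.3*(f-0.2)/0.3; else p=0.3*(1-f)/0.5; print p}'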
+ +### IHM +# Results with flags : --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 +#WER on dev 20.6 19.8 +#WER on eval 20.1 19.2 +#Final train prob -0.044763 -0.0666221 +#Final valid prob -0.0981107 -0.097616 +#Final train prob (xent) -0.722765 -0.915559 +#Final valid prob (xent) -1.03985 -1.09907 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.064->-0.059 xent:train/valid[58,88,final]=(-0.940,-0.739,-0.723/-1.14,-1.04,-1.04) logprob:train/valid[58,88,final]=(-0.067,-0.046,-0.045/-0.103,-0.099,-0.098) +# exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.094->-0.082 xent:train/valid[58,88,final]=(-3.10,-1.11,-0.916/-3.17,-1.29,-1.10) logprob:train/valid[58,88,final]=(-0.164,-0.073,-0.067/-0.182,-0.104,-0.098) + +# Results with flags for (1l.sh) : --num-epochs 5 --tlstm-affix 1i_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1i.sh) : --num-epochs 5 --tlstm-affix 1l_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_5epoch_sp_bi_ld5 tdnn_lstm1l_5epoch_sp_bi_ld5 +#WER on dev 20.8 19.7 +#WER on eval 20.6 19.3 +#Final train prob -0.0347795-0.0600903 +#Final valid prob -0.102486-0.0964607 +#Final train prob (xent) -0.621007 -0.84667 +#Final valid prob (xent) -1.02634 -1.04725 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.053->-0.049 xent:train/valid[73,110,final]=(-0.832,-0.631,-0.621/-1.09,-1.03,-1.03) logprob:train/valid[73,110,final]=(-0.057,-0.037,-0.035/-0.102,-0.103,-0.102) +# exp/ihm/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.085->-0.074 xent:train/valid[73,110,final]=(-3.14,-1.02,-0.847/-3.20,-1.21,-1.05) logprob:train/valid[73,110,final]=(-0.162,-0.065,-0.060/-0.177,-0.101,-0.096) + +### SDM +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1i_sp_bi_ihmali_ld5 tdnn_lstm1l_sp_bi_ihmali_ld5 +#WER on dev 37.0 35.9 +#WER on eval 40.0 39.4 +#Final train prob -0.106971 -0.15439 +#Final valid prob -0.252201 -0.244499 +#Final train prob (xent) -1.41142 -1.73795 +#Final valid prob (xent) -2.13741 -2.14519 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.138->-0.128 xent:train/valid[57,86,final]=(-1.78,-1.42,-1.41/-2.23,-2.14,-2.14) logprob:train/valid[57,86,final]=(-0.155,-0.108,-0.107/-0.251,-0.254,-0.252) +# exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.192->-0.174 xent:train/valid[57,86,final]=(-3.74,-1.95,-1.74/-3.86,-2.31,-2.15) logprob:train/valid[57,86,final]=(-0.287,-0.165,-0.154/-0.335,-0.250,-0.244) + +# Results with flags for (1i.sh) : --num-epochs 5 --tlstm-affix 1i_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +# Results 
with flags for (1l.sh) : --num-epochs 5 --tlstm-affix 1l_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5 +#WER on dev 36.9 35.8 +#WER on eval 40.2 39.5 +#Final train prob -0.0854552 -0.134189 +#Final valid prob -0.262789 -0.244183 +#inal train prob (xent) -1.2195 -1.58789 +#Final valid prob (xent) -2.13389 -2.08964 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5 exp/sdm1/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.111->-0.104 xent:train/valid[71,108,final]=(-1.61,-1.25,-1.22/-2.16,-2.15,-2.13) logprob:train/valid[71,108,final]=(-0.133,-0.089,-0.085/-0.246,-0.264,-0.263) +# exp/sdm1/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.170->-0.153 xent:train/valid[71,108,final]=(-3.67,-1.76,-1.59/-3.81,-2.22,-2.09) logprob:train/valid[71,108,final]=(-0.274,-0.144,-0.134/-0.327,-0.248,-0.244) + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +dropout_schedule='0,0@0.20,0.3@0.50,0' + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1l #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh new file mode 100644 index 00000000000..b0e7af0618d --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -0,0 +1,352 @@ +#!/bin/bash + +# This (1m.sh) is the same as 1j but with per-frame dropout on LSTM layer +# It is a fast LSTM with per-frame dropout on [i, f, o] gates of the LSTM, +# the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. +# We have tried both 4-epoch and 5-epoch training. 
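+#
+# Example invocations (these match the flag sets quoted in the results below; the
+# 5-epoch variant overrides the num_epochs option defined in this script and uses a
+# separate --tlstm-affix so the two runs land in different experiment directories):
+#   local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned
+#   local/chain/tuning/run_tdnn_lstm_1m.sh --num-epochs 5 --tlstm-affix 1m_5epoch \
+#     --mic ihm --train-set train_cleaned --gmm tri3_cleaned
+#
+# In this fast-LSTM version the per-frame dropout is enabled by the
+# "dropout-proportion=0.0" entry in lstm_opts (applied to each fast-lstmp-layer)
+# together with --trainer.dropout-schedule "$dropout_schedule" in the train.py call below.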
+ +### IHM +# Results with flags : --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 +#WER on dev 20.8 19.9 +#WER on eval 20.3 19.3 +#Final train prob -0.0439145 -0.0653269 +#Final valid prob -0.10673 -0.0998743 +#Final train prob (xent) -0.683776 -0.884698 +#Final valid prob (xent) -1.05254 -1.09002 + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.063->-0.058 xent:train/valid[58,88,final]=(-0.888,-0.695,-0.684/-1.12,-1.06,-1.05) logprob:train/valid[58,88,final]=(-0.065,-0.045,-0.044/-0.105,-0.107,-0.107) +# exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.092->-0.080 xent:train/valid[58,88,final]=(-3.12,-1.09,-0.885/-3.20,-1.27,-1.09) logprob:train/valid[58,88,final]=(-0.164,-0.072,-0.065/-0.181,-0.103,-0.100) + +# Results with flags for (1m.sh) : --num-epochs 5 --tlstm-affix 1m_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +# Results with flags for (1j.sh) : --num-epochs 5 --tlstm-affix 1j_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_5epoch_sp_bi_ld5 tdnn_lstm1m_5epoch_sp_bi_ld5 +#WER on dev 21.1 19.9 +#WER on eval 20.9 19.8 +#Final train prob -0.0365079 -0.057024 +#Final valid prob -0.112709-0.0992725 +#inal train prob (xent) -0.601602 -0.800653 +#Final valid prob (xent) -1.03241 -1.04748 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.053->-0.049 xent:train/valid[73,110,final]=(-0.813,-0.615,-0.602/-1.08,-1.04,-1.03) logprob:train/valid[73,110,final]=(-0.057,-0.038,-0.037/-0.106,-0.113,-0.113) +# exp/ihm/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.080->-0.072 xent:train/valid[73,110,final]=(-3.15,-0.985,-0.801/-3.26,-1.21,-1.05) logprob:train/valid[73,110,final]=(-0.161,-0.062,-0.057/-0.183,-0.102,-0.099) + +#### SDM +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_sp_bi_ihmali_ld5 tdnn_lstm1m_sp_bi_ihmali_ld5 +#WER on dev 36.9 36.4 +#WER on eval 40.5 39.9 +#Final train prob -0.108141 -0.148861 +#Final valid prob -0.257468 -0.240962 +#Final train prob (xent) -1.38179 -1.70258 +#Final valid prob (xent) -2.13095 -2.12803 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1m_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.138->-0.128 xent:train/valid[57,86,final]=(-1.71,-1.39,-1.38/-2.18,-2.14,-2.13) logprob:train/valid[57,86,final]=(-0.150,-0.110,-0.108/-0.251,-0.260,-0.257) +# exp/sdm1/chain_cleaned/tdnn_lstm1m_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.187->-0.170 xent:train/valid[57,86,final]=(-3.74,-1.90,-1.70/-3.88,-2.28,-2.13) logprob:train/valid[57,86,final]=(-0.286,-0.158,-0.149/-0.336,-0.245,-0.241) + +# Results with flags for (1m.sh) : --num-epochs 5 --tlstm-affix 1m_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +# Results 
with flags for (1j.sh) : --num-epochs 5 --tlstm-affix 1j_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5 +#WER on dev 37.4 36.0 +#WER on eval 40.7 39.6 +#Final train prob -0.0879063 -0.133092 +#Final valid prob -0.270953 -0.243246 +#Final train prob (xent) -1.20822 -1.56293 +#Final valid prob (xent) -2.1425 -2.07265 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.115->-0.107 xent:train/valid[71,108,final]=(-1.56,-1.22,-1.21/-2.16,-2.16,-2.14) logprob:train/valid[71,108,final]=(-0.131,-0.090,-0.088/-0.256,-0.273,-0.271) +# exp/sdm1/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.167->-0.153 xent:train/valid[71,108,final]=(-3.69,-1.71,-1.56/-3.84,-2.20,-2.07) logprob:train/valid[71,108,final]=(-0.279,-0.140,-0.133/-0.329,-0.247,-0.243) + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +dropout_schedule='0,0@0.20,0.3@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +num_epochs=4 + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1m #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index f103200f966..2cf34c600c1 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -203,6 +203,12 @@ exit 0 %WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# current best 'chain' models with TDNN + LSTM + dropout (see local/chain/run_tdnn_lstm_1l.sh) +%WER 13.5 | 4459 42989 | 88.2 8.0 3.8 1.7 13.5 48.2 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 8.8 | 1831 21395 | 92.3 5.2 2.5 1.1 8.8 41.9 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 18.1 | 2628 21594 | 84.0 10.8 5.2 2.2 18.1 52.6 
| exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 11.59 [ 5615 / 48460, 708 ins, 1450 del, 3457 sub ] exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 + # these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh %WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 19.4 | 1831 21395 | 83.5 11.2 5.2 3.0 19.4 60.7 | exp/ctc/lstm_sp/decode_eval2000_sw1_tg_0.15/score_12_0.5/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh new file mode 100644 index 00000000000..68daf81ab01 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -0,0 +1,248 @@ +#!/bin/bash + +# 6l is same as 6k, but with the per-frame dropout +# location4 as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# local/chain/compare_wer_general.sh blstm_6k_sp blstm_6l_sp +# attention: the blatm_6k_sp result here is far better than the updated +# result (14.5 vs 14.1), this may due to noise + +# System blstm_6k_sp blstm_6l_sp +# WER on train_dev(tg) 13.30 13.06 +# WER on train_dev(fg) 12.34 12.16 +# WER on eval2000(tg) 15.5 15.2 +# WER on eval2000(fg) 14.1 13.8 +# Final train prob -0.052 -0.065 +# Final valid prob -0.090 -0.093 +# Final train prob (xent) -0.743 -0.831 +# Final valid prob (xent) -0.9579 -0.9821 + +# exp/chain/blstm_6k_sp/: num-iters=327 nj=3..16 num-params=41.2M dim=40+100->6074 combine=-0.069->-0.069 xent:train/valid[217,326,final]=(-0.849,-0.748,-0.743/-1.04,-0.959,-0.958) logprob:train/valid[217,326,final]=(-0.065,-0.053,-0.052/-0.096,-0.090,-0.090) +# exp/chain/blstm_6l_sp/: num-iters=327 nj=3..16 num-params=41.2M dim=40+100->6074 combine=-0.084->-0.082 xent:train/valid[217,326,final]=(-1.45,-0.840,-0.831/-1.58,-0.994,-0.982) logprob:train/valid[217,326,final]=(-0.110,-0.066,-0.065/-0.132,-0.094,-0.093) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6l # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 +dropout_schedule='0,0@0.20,0.1@0.50,0' + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
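+  # (The 7000 passed to build_tree.sh below is the requested number of tree leaves.)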
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh new file mode 100644 index 00000000000..3929cdc432e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh @@ -0,0 +1,248 @@ +#!/bin/bash + +# tdnn_blstm_1b is same as tdnn_blstm_1a, but with the per-frame dropout +# added with location 4, see paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh tdnn_blstm_1a_sp tdnn_blstm_1b_sp +# System tdnn_blstm_1a_sp tdnn_blstm_1b_sp +# WER on train_dev(tg) 12.86 12.60 +# WER on train_dev(fg) 11.86 11.80 +# WER on eval2000(tg) 15.3 14.9 +# WER on eval2000(fg) 14.0 13.5 +# Final train prob -0.042 -0.054 +# Final valid prob -0.099 -0.091 +# Final train prob (xent) -0.637 -0.719 +# Final valid prob (xent) -0.9418 -0.9190 + +# exp/chain/tdnn_blstm_1a_sp/: num-iters=327 nj=3..16 num-params=53.7M dim=40+100->6074 combine=-0.058->-0.057 xent:train/valid[217,326,final]=(-0.753,-0.631,-0.637/-0.974,-0.941,-0.942) logprob:train/valid[217,326,final]=(-0.055,-0.041,-0.042/-0.094,-0.099,-0.099) +# exp/chain/tdnn_blstm_1b_sp/: num-iters=327 nj=3..16 num-params=53.7M dim=40+100->6074 combine=-0.070->-0.068 xent:train/valid[217,326,final]=(-1.27,-0.732,-0.719/-1.42,-0.931,-0.919) logprob:train/valid[217,326,final]=(-0.094,-0.055,-0.054/-0.117,-0.091,-0.091) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_blstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 +dropout_schedule='0,0@0.20,0.1@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
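+  # (With xent_regularize=0.025 as set above, learning_rate_factor works out to 0.5/0.025 = 20.)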
+ output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh new file mode 100644 index 00000000000..21cb4fa9373 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# run_tdnn_lstm_1k.sh is like run_tdnn_lstm_1e.sh but +# added the per-frame dropout location 4 as paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1k_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1k_sp +# WER on train_dev(tg) 13.18 12.60 +# [looped:] 13.10 12.56 +# WER on train_dev(fg) 12.21 11.58 +# [looped:] 12.28 11.62 +# WER on eval2000(tg) 15.8 15.2 +# [looped:] 15.8 15.2 +# WER on eval2000(fg) 14.5 13.7 +# [looped:] 14.5 13.8 +# Final train prob -0.060 -0.076 +# Final valid prob -0.101 -0.106 +# Final train prob (xent) -0.868 -0.989 +# Final valid prob (xent) -1.0740 -1.1341 + +# exp/chain/tdnn_lstm_1e_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.072->-0.071 xent:train/valid[173,261,final]=(-1.01,-0.876,-0.868/-1.16,-1.08,-1.07) logprob:train/valid[173,261,final]=(-0.075,-0.061,-0.060/-0.106,-0.101,-0.101) +# exp/chain/tdnn_lstm_1k_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.093->-0.089 xent:train/valid[173,261,final]=(-2.87,-1.07,-0.989/-2.90,-1.20,-1.13) logprob:train/valid[173,261,final]=(-0.153,-0.079,-0.076/-0.179,-0.107,-0.106) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1e # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100644 index 00000000000..e88e199839c --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# tdnn_lstm_1l is same as tdnn_lstm_1b, but with the per-frame dropout +# added with location 4 in LSTM layer, see paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh tdnn_lstm_1b_ld5_sp tdnn_lstm_1l_ld5_sp +# System tdnn_lstm_1b_ld5_sp tdnn_lstm_1l_ld5_sp +# WER on train_dev(tg) 13.06 12.41 +# WER on train_dev(fg) 12.13 11.59 +# WER on eval2000(tg) 15.1 14.8 +# WER on eval2000(fg) 13.9 13.5 +# Final train prob -0.047 -0.069 +# Final valid prob -0.093 -0.095 +# Final train prob (xent) -0.735 -0.913 +# Final valid prob (xent) -1.0151 -1.0820 + +# exp/chain/tdnn_lstm_1b_ld5_sp: num-iters=327 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.062->-0.061 xent:train/valid[217,326,final]=(-0.877,-0.741,-0.735/-1.08,-1.02,-1.02) logprob:train/valid[217,326,final]=(-0.063,-0.048,-0.047/-0.095,-0.093,-0.093) +# exp/chain/tdnn_lstm_1l_ld5_sp: num-iters=327 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.088->-0.084 xent:train/valid[217,326,final]=(-3.32,-0.961,-0.913/-3.40,-1.13,-1.08) logprob:train/valid[217,326,final]=(-0.176,-0.072,-0.069/-0.198,-0.097,-0.095) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1l # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh new file mode 100644 index 00000000000..dc0f59fb64a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# 1s is as 1e, but adding per-frame dropout to LSTM in location4 +# as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1s_sp_bi +# WER on dev(orig) 9.0 8.9 +# [looped:] 9.0 8.9 +# WER on dev(rescored) 8.4 8.1 +# [looped:] 8.4 8.1 +# WER on test(orig) 8.9 8.8 +# [looped:] 8.9 8.8 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.4 8.3 +# Final train prob -0.0712 -0.0914 +# Final valid prob -0.0892 -0.0977 +# Final train prob (xent) -0.8566 -0.9931 +# Final valid prob (xent) -0.9927 -1.0633 + +# exp/chain_cleaned/tdnn_lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.082->-0.081 xent:train/valid[167,252,final]=(-0.961,-0.859,-0.857/-1.06,-0.999,-0.993) logprob:train/valid[167,252,final]=(-0.086,-0.072,-0.071/-0.098,-0.091,-0.089) +# exp/chain_cleaned/tdnn_lstm1s_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.104->-0.101 xent:train/valid[167,252,final]=(-3.08,-1.07,-0.993/-3.13,-1.14,-1.06) logprob:train/valid[167,252,final]=(-0.181,-0.093,-0.091/-0.183,-0.100,-0.098) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +dropout_schedule="0,0@0.2,0.3@0.5,0" +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. 
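A note on the dropout_schedule values used in these dropout recipes (e.g. "0,0@0.2,0.3@0.5,0" just above, and '0,0@0.20,0.3@0.50,0' in the swbd script): the schedule is read as a piecewise-linear function of the fraction of training data processed, so the per-frame dropout proportion stays at 0 until 20% of training, ramps up to 0.3 at 50%, and ramps back down to 0 by the end. The sketch below is only an illustration of that interpretation; the function names are made up and this is not Kaldi's actual option-parsing code.

# Illustrative sketch of how a schedule string like "0,0@0.2,0.3@0.5,0" is read
# (piecewise-linear in the fraction of data processed); hypothetical helpers,
# not Kaldi's own parser.
def parse_dropout_schedule(schedule):
    points = []
    parts = schedule.split(',')
    for i, part in enumerate(parts):
        if '@' in part:
            value, frac = part.split('@')
            points.append((float(frac), float(value)))
        else:
            # in this simple sketch, a bare first value is taken to be at
            # fraction 0.0 and a bare last value at fraction 1.0
            points.append((0.0 if i == 0 else 1.0, float(part)))
    return sorted(points)

def dropout_proportion_at(schedule, data_fraction):
    points = parse_dropout_schedule(schedule)
    for (f0, v0), (f1, v1) in zip(points, points[1:]):
        if f0 <= data_fraction <= f1:
            return v0 if f1 == f0 else v0 + (v1 - v0) * (data_fraction - f0) / (f1 - f0)
    return points[-1][1]

# dropout_proportion_at("0,0@0.2,0.3@0.5,0", 0.35) -> 0.15 (ramping up)
# dropout_proportion_at("0,0@0.2,0.3@0.5,0", 0.75) -> 0.15 (ramping back down)

This up-then-down shape is the one used consistently across the dropout recipes in this patch.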
+remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
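To make the comment above concrete, a small worked example using this script's xent_regularize=0.1 (illustrative arithmetic only, not part of the recipe):

# The xent branch's objective is weighted by xent_regularize, so the
# derivatives reaching its output layer are scaled by 0.1; giving that layer a
# learning-rate factor of 0.5 / 0.1 = 5.0 makes the product 0.5, i.e. the xent
# final layer learns at a rate that does not depend on the regularization
# constant, as the comment above explains.
xent_regularize = 0.1
learning_rate_factor = 0.5 / xent_regularize    # -> 5.0, matching the
                                                # "print 0.5/$xent_regularize" line above
print(learning_rate_factor * xent_regularize)   # -> 0.5, independent of xent_regularize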
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... 
[it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh new file mode 100644 index 00000000000..c286fcef353 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# 1t is as 1e, but increasing the TDNN dim and LSTM cell-dim into +# 1024, the recurrent and non-recurrent projection of the LSTM from +# 128 into 256. 
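The effect of these dimension increases shows up in the num-params figures quoted below (9.5M for 1e vs 37.1M for 1t). A back-of-the-envelope sketch of why, ignoring the TDNN and output layers, bias/natural-gradient bookkeeping details, and using only the component shapes visible in the xconfig (W_all over Append(input, r_trunc), the 3 diagonal gate parameters, and the W_rp projection); the helper below is illustrative, not Kaldi code:

# Rough per-LSTMP-layer parameter count; names and the exact bias accounting
# are assumptions made for illustration.
def lstmp_params(input_dim, cell_dim, rproj, nproj):
    w_all = (input_dim + rproj) * 4 * cell_dim + 4 * cell_dim   # gate affine + bias
    diag  = 3 * cell_dim                                        # w_ic, w_fc, w_oc
    w_rp  = cell_dim * (rproj + nproj) + (rproj + nproj)        # projection + bias
    return w_all + diag + w_rp

print(lstmp_params(512, 512, 128, 128))     # ~1.4M per LSTMP layer in 1e
print(lstmp_params(1024, 1024, 256, 256))   # ~5.8M per LSTMP layer in 1t, roughly 4x

Since the TDNN affine matrices also scale roughly quadratically when both their input and output dimensions double, the ~4x jump in total parameters is expected.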
+ +# ./local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_again_sp_bi tdnn_lstm1t_again_sp_bi +# WER on dev(orig) 9.0 8.9 +# [looped:] 9.0 8.9 +# WER on dev(rescored) 8.4 8.2 +# [looped:] 8.4 8.3 +# WER on test(orig) 8.9 8.9 +# [looped:] 8.9 9.0 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.4 8.5 +# Final train prob -0.0712 -0.0459 +# Final valid prob -0.0892 -0.0867 +# Final train prob (xent) -0.8566 -0.6434 +# Final valid prob (xent) -0.9927 -0.8733 + +# exp/chain_cleaned/tdnn_lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.082->-0.081 xent:train/valid[167,252,final]=(-0.961,-0.859,-0.857/-1.06,-0.999,-0.993) logprob:train/valid[167,252,final]=(-0.086,-0.072,-0.071/-0.098,-0.091,-0.089) +# exp/chain_cleaned/tdnn_lstm1t_sp_bi: num-iters=253 nj=2..12 num-params=37.1M dim=40+100->3626 combine=-0.055->-0.055 xent:train/valid[167,252,final]=(-0.774,-0.655,-0.643/-0.928,-0.883,-0.873) logprob:train/valid[167,252,final]=(-0.063,-0.048,-0.046/-0.087,-0.089,-0.087) + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh new file mode 100644 index 00000000000..9e50060f5d6 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# 1u is the same as 1t but adding per-frame dropout to LSTM +# in location4, see paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1t_again_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 8.9 8.6 +# WER on dev(rescored) 8.2 8.0 +# WER on test(orig) 8.9 8.3 +# WER on test(rescored) 8.4 7.9 +# Final train prob -0.0459 -0.0709 +# Final valid prob -0.0867 -0.0902 +# Final train prob (xent) -0.6434 -0.8112 +# Final valid prob (xent) -0.8733 -0.9384 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +dropout_schedule="0,0@0.20,0.3@0.5,0" +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 
decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 4ffebcd9436..c92afb1c2dc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -716,9 +716,9 @@ def set_default_configs(self): 'decay-time': -1.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'dropout-per-frame' : False # If false, regular dropout, not per frame. - } + 'dropout-proportion' : -1.0, # If -1.0, no dropout will + # be used) + } def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -751,7 +751,6 @@ def check_configs(self): raise RuntimeError("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) - def auxiliary_outputs(self): return ['c_t'] @@ -818,7 +817,6 @@ def generate_lstm_config(self): lstm_str = self.config['lstm-nonlinearity-options'] dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' configs = [] @@ -833,14 +831,16 @@ def generate_lstm_config(self): configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") - configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, lstm_str)) + configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} " + "use-dropout={2} {3}" + .format(name, cell_dim, "true" if dropout_proportion != -1.0 else "false", lstm_str)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) if dropout_proportion != -1.0: - configs.append("component name={0}.cr_trunc.dropout type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim + rec_proj_dim, dropout_proportion, dropout_per_frame)) + configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=3 " + "dropout-proportion={1} " + .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " @@ -849,8 +849,17 @@ def generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) - configs.append("component-node 
name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + if dropout_proportion != -1.0: + # note: the 'input' is a don't-care as the component never uses it; it's required + # in component-node lines. + configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " + "input={0}.dropout_mask".format(name)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" + .format(name, delay)) + else: + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " @@ -864,17 +873,10 @@ def generate_lstm_config(self): configs.append("# makes the deriv truncation more accurate .") configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " "input=Append({0}.c, {0}.r)".format(name)) - if dropout_proportion != -1.0: - configs.append("component-node name={0}.cr_trunc.dropout component={0}.cr_trunc.dropout input={0}.cr_trunc".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc.dropout " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc.dropout " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) - else: - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 444da38dd30..5b72a62e716 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -330,6 +330,7 @@ void cudaF_diff_log_softmax(dim3 Gr, dim3 Bl, const MatrixDim in_deriv_dim, const float* out_deriv, const int out_deriv_stride, float* in_deriv); void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int in_stride, const double* params, const int params_stride, @@ -349,6 +350,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, double* self_repair_sum_out, const int self_repair_sum_out_stride); void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int in_stride, const float* params, const int params_stride, @@ -455,12 +457,14 @@ void cudaF_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + 
const int num_rows, double* out); void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out); void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 60800d9568d..6df0e5af9db 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2846,6 +2846,9 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, consecutive blocks, each of dimension cell_dim, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + If 'have_dropout_mask' is nonzero, each row of + 'in' will have 3 extra elements, interpreted + as dropout masks/scales for i_t, f_t and o_t. @param [in] params A matrix, of dimension 3 by cell_dim, with rows containing the 3 diagonal parameter matrices used in LSTMs, namely @@ -2870,7 +2873,8 @@ __global__ static void _lstm_nonlinearity(const Real* in, const int in_stride, const Real* params, const int params_stride, const int out_stride, const int cell_dim, - const int num_rows, Real* out) { + const int have_dropout_mask, const int num_rows, + Real* out) { const int tid = threadIdx.x; const int i = blockIdx.x; const Real* i_part = in + i * in_stride; @@ -2883,15 +2887,18 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, const Real* w_oc = params + params_stride * 2; Real* c_t = out + i * out_stride; Real* m_t = out + i * out_stride + cell_dim; + Real i_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5] : 1), + f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1), + o_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 2] : 1); for (int j = tid; j < cell_dim; j += CU1DBLOCK) { Real c_tm1_j = c_tm1[j]; Real i_t_j = Real(1) / (Real(1) + exp(-i_part[j] - w_ic[j] * c_tm1_j)); Real f_t_j = Real(1) / (Real(1) + exp(-f_part[j] - w_fc[j] * c_tm1_j)); - Real c_t_j = f_t_j * c_tm1_j + i_t_j * tanh(c_part[j]); + Real c_t_j = f_t_j * f_scale * c_tm1_j + i_t_j * i_scale * tanh(c_part[j]); Real o_t_j = Real(1) / (Real(1) + exp(-o_part[j] - w_oc[j] * c_t_j)); c_t[j] = c_t_j; - m_t[j] = o_t_j * tanh(c_t_j); + m_t[j] = o_t_j * o_scale * tanh(c_t_j); } } @@ -2916,6 +2923,9 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + If 'have_dropout_mask' is nonzero, each row of + 'in' will have 3 extra elements, interpreted + as dropout masks/scales for i_t, f_t and o_t. @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -2988,7 +2998,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, */ template __global__ -static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, +static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_mask, + const int num_rows, const Real* input, const int input_stride, const Real* params, const int params_stride, const Real* output_deriv, @@ -3042,6 +3053,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real o_t_self_repair = (update_sr[3] ? 
sr_config[8] : 0); const Real c_t_self_repair = (update_sr[4] ? sr_config[9] : 0); + for (int i = i0; i < num_rows; i += grid_stride) { const Real i_part = input[i * input_stride + j]; const Real f_part = input[i * input_stride + j + cell_dim]; @@ -3049,10 +3061,19 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real o_part = input[i * input_stride + j + 3 * cell_dim]; const Real c_prev = input[i * input_stride + j + 4 * cell_dim]; - const Real i_t = 1 / (1 + exp(-i_part - w_ic * c_prev)); - const Real f_t = 1 / (1 + exp(-f_part - w_fc * c_prev)); + + const Real i_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5] : 1), + f_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5 + 1] :1), + o_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5 + 2] :1); + + + const Real i_t = Real(1) / (1 + exp(-i_part - w_ic * c_prev)); + const Real f_t = Real(1) / (1 + exp(-f_part - w_fc * c_prev)); const Real tanh_c_part = tanh(c_part); - const Real c_t = f_t * c_prev + i_t * tanh_c_part; + const Real c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part; const Real o_t = 1 / (1 + exp(-o_part - w_oc * c_t)); const Real tanh_c_t = tanh(c_t); @@ -3079,20 +3100,20 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real dc_t_out = output_deriv[i * output_deriv_stride + j]; const Real dm_t = output_deriv[i * output_deriv_stride + j + cell_dim]; - const Real dtanh_c_t = o_t * dm_t; - const Real do_t = tanh_c_t * dm_t; + const Real dtanh_c_t = o_t * o_scale * dm_t; + const Real do_t = o_scale * tanh_c_t * dm_t; const Real do_t_input = (o_t_deriv * do_t - (2 * o_t - 1) * o_t_self_repair); const Real dc_t = (c_t_deriv * dtanh_c_t + dc_t_out + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; - const Real dtanh_c_part = i_t * dc_t; - const Real df_t = dc_t * c_prev; + const Real dtanh_c_part = i_t * i_scale * dc_t; + const Real df_t = dc_t * f_scale * c_prev; const Real df_t_input = (df_t * f_t_deriv - - (2 * f_t - 1) * f_t_self_repair); - const Real di_t = dc_t * tanh_c_part; + - (2 * f_t - 1) * f_t_self_repair); + const Real di_t = dc_t * i_scale * tanh_c_part; const Real di_t_input = (di_t * i_t_deriv - - (2 * i_t - 1) * i_t_self_repair); + - (2 * i_t - 1) * i_t_self_repair); if (params_deriv) { w_ic_deriv_sum += c_prev * di_t_input; @@ -3100,7 +3121,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, w_oc_deriv_sum += c_t * do_t_input; } - const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t; const Real do_part = do_t_input; const Real dc_part = (c_part_deriv * dtanh_c_part - tanh_c_part * c_part_self_repair); @@ -4737,20 +4758,23 @@ void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, - double* out) { - _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out) { + _lstm_nonlinearity<<>>( + in, in_stride, params, params_stride, + out_stride, cell_dim, have_dropout_mask, num_rows, out); } void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, 
const int out_stride, - const int cell_dim, const int num_rows, - float* out) { - _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out) { + _lstm_nonlinearity<<>>( + in, in_stride, params, params_stride, + out_stride, cell_dim, have_dropout_mask, num_rows, out); } void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, const int params_stride, @@ -4769,7 +4793,8 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + _diff_lstm_nonlinearity<<>>( + cell_dim, have_dropout_mask, num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, @@ -4777,6 +4802,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out, self_repair_sum_out_stride); } void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, const int params_stride, @@ -4795,7 +4821,8 @@ void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + _diff_lstm_nonlinearity<<>>( + cell_dim, have_dropout_mask, num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 77352b5925f..d2a79f471c8 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -626,6 +626,7 @@ inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl, out_deriv, out_deriv_stride, in_deriv); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, @@ -645,7 +646,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, + input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -656,6 +658,7 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out_stride); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, @@ -675,7 +678,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int 
self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, + num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -849,17 +853,21 @@ inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, double* out) { cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, float* out) { cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 0febd5c0853..daf5c708465 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -144,7 +144,8 @@ static void UnitTestCuMathComputeLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 100; int32 cell_dim = 1 + Rand() % 2000; - Matrix Hinput(num_rows, 5 * cell_dim); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); + Matrix Hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix Hparams(3, cell_dim); Matrix Houtput(num_rows, 2 * cell_dim); Hinput.SetRandn(); @@ -165,7 +166,8 @@ static void UnitTestCuMathComputeLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - CuMatrix input(num_rows, 5 * cell_dim); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); + CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output(num_rows, 2 * cell_dim); input.SetRandn(); @@ -190,7 +192,8 @@ void UnitTestLstmNonlinearity() { // problem dimensions. int32 num_rows = RandInt(5, 20), - cell_dim = RandInt(2, 200); + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -207,7 +210,7 @@ void UnitTestLstmNonlinearity() { test_params = -1; - CuMatrix input(num_rows, cell_dim * 5), + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); input.SetRandn(); @@ -230,7 +233,7 @@ void UnitTestLstmNonlinearity() { CuVector self_repair_config(10.0); // leave at zero... we don't really test this here. 
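The unit tests above now draw dropout_dim as either 0 or 3 at random, so both the plain 5*cell_dim input and the 5*cell_dim + 3 input (with trailing i/f/o scales) get exercised. As a rough sketch of that row layout — not part of the patch; the helper name MakeLstmInputRow, the use of <random>, and the 0/1 Bernoulli masking are illustrative assumptions — one row could be built like this:

#include <random>
#include <vector>

// Layout of one input row when the optional mask is present:
//   [ i_part | f_part | c_part | o_part | c_prev ]   (5 * cell_dim values)
//   [ i_scale  f_scale  o_scale ]                    (3 trailing values)
std::vector<float> MakeLstmInputRow(int cell_dim, bool have_dropout_mask,
                                    float dropout_proportion,
                                    std::mt19937 *rng) {
  std::normal_distribution<float> gauss(0.0f, 1.0f);
  std::bernoulli_distribution keep(1.0 - dropout_proportion);
  std::vector<float> row(5 * cell_dim + (have_dropout_mask ? 3 : 0));
  for (int c = 0; c < 5 * cell_dim; c++)
    row[c] = gauss(*rng);             // random gate/cell pre-activations
  if (have_dropout_mask)
    for (int k = 0; k < 3; k++)       // 0/1 scales for i_t, f_t, o_t
      row[5 * cell_dim + k] = keep(*rng) ? 1.0f : 0.0f;
  return row;
}

With dropout_proportion = 0 the three trailing scales are always 1.0, which behaves the same as the original 5*cell_dim layout.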
CuMatrix self_repair_sum(5, cell_dim), - input_deriv(num_rows, 5 * cell_dim), + input_deriv(num_rows, 5 * cell_dim + dropout_dim), params_deriv(3, cell_dim); double count_in = 0.0; @@ -249,7 +252,7 @@ void UnitTestLstmNonlinearity() { measured_objf_change(test_dim); for (int32 i = 0; i < test_dim; i++) { - CuMatrix delta_input(num_rows, 5 * cell_dim), + CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); @@ -260,12 +263,9 @@ void UnitTestLstmNonlinearity() { delta_params.Scale(delta); } - - predicted_objf_change(i) = TraceMatMat(delta_input, input_deriv, kTrans) + TraceMatMat(delta_params, params_deriv, kTrans); - CuMatrix perturbed_input(input); perturbed_input.AddMat(1.0, delta_input); @@ -280,7 +280,9 @@ void UnitTestLstmNonlinearity() { measured_objf_change(i) = objf_change; } KALDI_LOG << "LSTM nonlinearity test: num_rows=" << num_rows - << ", cell_dim=" << cell_dim << ", test_input=" << test_input + << ", cell_dim=" << cell_dim + << ", dropout_dim=" << dropout_dim + << ", test_input=" << test_input << ", test_params=" << test_params << ", test_output=" << test_output << ", predicted_objf_change=" << predicted_objf_change @@ -296,16 +298,17 @@ template static void UnitTestBackpropLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 200; - int32 cell_dim = 1 + Rand() % 2000; + int32 cell_dim = 1 + Rand() % 2000, + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // KALDI_LOG << num_rows << ", " << cell_dim; - Matrix hinput(num_rows, 5 * cell_dim); + Matrix hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix hparams(3, cell_dim); Matrix houtput_deriv(num_rows, 2 * cell_dim); Matrix hderiv_sum_in(5, cell_dim); Vector hself_repair_config(10); double count_in; - Matrix hinput_deriv(num_rows, 5 * cell_dim); + Matrix hinput_deriv(num_rows, 5 * cell_dim + dropout_dim); Matrix hparams_deriv(3, cell_dim); Matrix hvalue_sum_out(5, cell_dim); Matrix hderiv_sum_out(5, cell_dim); @@ -409,15 +412,16 @@ static void UnitTestBackpropLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; + int32 dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); - CuMatrix input(num_rows, 5 * cell_dim); + CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output_deriv(num_rows, 2 * cell_dim); CuMatrix deriv_sum_in(5, cell_dim); CuVector self_repair_config(10); double count_in; - CuMatrix input_deriv(num_rows, 5 * cell_dim); + CuMatrix input_deriv(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params_deriv(3, cell_dim); CuMatrix value_sum_out(5, cell_dim); CuMatrix deriv_sum_out(5, cell_dim); diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 2bd184bf116..a9cd9efcfce 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -411,10 +411,11 @@ template void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, const MatrixBase ¶ms_mat, MatrixBase *output) { - int32 num_rows = input_mat.NumRows(); - int32 cell_dim = input_mat.NumCols() / 5; + int32 num_rows = input_mat.NumRows(), + input_cols = input_mat.NumCols(), + cell_dim = input_cols / 5; + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(output->NumRows() == num_rows); - KALDI_ASSERT(input_mat.NumCols() % 5 == 0); KALDI_ASSERT(params_mat.NumRows() == 3); KALDI_ASSERT(params_mat.NumCols() == cell_dim); KALDI_ASSERT(output->NumCols() == 2 * cell_dim); @@ -424,6 +425,11 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, int32 params_stride = params_mat.Stride(); for (int32 r = 0; r < num_rows; r++) { const Real *input_row = input_mat.RowData(r); + // i_scale and f_scale relate to dropout, they will normally be 1.0. + Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]), + f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]), + o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]); + Real *output_row = output_mat.RowData(r); for (int32 c = 0; c < cell_dim; c++) { Real i_part = input_row[c]; @@ -436,9 +442,9 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, Real w_oc = params_data[c + params_stride * 2]; Real i_t = ScalarSigmoid(i_part + w_ic * c_prev); Real f_t = ScalarSigmoid(f_part + w_fc * c_prev); - Real c_t = f_t * c_prev + i_t * ScalarTanh(c_part); + Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part); Real o_t = ScalarSigmoid(o_part + w_oc * c_t); - Real m_t = o_t * ScalarTanh(c_t); + Real m_t = o_t * o_scale * ScalarTanh(c_t); output_row[c] = c_t; output_row[c + cell_dim] = m_t; } @@ -449,10 +455,11 @@ template void ComputeLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, CuMatrixBase *output) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + input_cols = input.NumCols(), + cell_dim = input_cols / 5; + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(output->NumRows() == num_rows); - KALDI_ASSERT(input.NumCols() % 5 == 0); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output->NumCols() == 2 * cell_dim); @@ -461,6 +468,8 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, if (CuDevice::Instantiate().Enabled()) { Timer tim; + int have_dropout_mask = (input_cols == (cell_dim * 5) + 3); + // Each thread block is working on 1 row of the data. 
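CpuComputeLstmNonlinearity above applies the optional per-row scales only where the corresponding gates act. A scalar restatement of those forward equations for one cell dimension — purely illustrative, with invented names (LstmCellForward, LstmCellOutput) and self-repair omitted since it does not enter the forward pass — looks like this:

#include <cmath>

struct LstmCellOutput { double c_t, m_t; };

// One cell dimension of the forward pass; with all three scales equal to 1.0
// this reduces to the original (no-dropout) equations.
LstmCellOutput LstmCellForward(double i_part, double f_part, double c_part,
                               double o_part, double c_prev,
                               double w_ic, double w_fc, double w_oc,
                               double i_scale, double f_scale, double o_scale) {
  auto sigmoid = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
  double i_t = sigmoid(i_part + w_ic * c_prev);
  double f_t = sigmoid(f_part + w_fc * c_prev);
  double c_t = f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
  double o_t = sigmoid(o_part + w_oc * c_t);
  double m_t = o_t * o_scale * std::tanh(c_t);
  return {c_t, m_t};
}

Setting i_scale = f_scale = o_scale = 1.0 recovers the original no-dropout equations, which is why the 5C-wide input path is unchanged.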
// It's best that cell dim is a multiple fo CU1DBLOCK dim3 dimBlock(CU1DBLOCK); @@ -468,7 +477,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(), params.Data(), params.Stride(), output->Stride(), - cell_dim, num_rows, output->Data()); + cell_dim, have_dropout_mask, num_rows, output->Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -508,10 +517,12 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, MatrixBase *value_sum_out, MatrixBase *deriv_sum_out, MatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + input_cols = input + .NumCols(), + cell_dim = input.NumCols() / 5; // Check dimensions. - KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -606,6 +617,14 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, c_part = input_mat(r, c + 2 * cell_dim), o_part = input_mat(r, c + 3 * cell_dim), c_prev = input_mat(r, c + 4 * cell_dim); + + Real i_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5)), + f_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5 + 1)), + o_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5 + 2)); + // For greater clarity, we give some of the quantities in the // forward equations their own names. Real i_t_input = i_part + w_ic * c_prev, @@ -613,7 +632,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, f_t_input = f_part + w_fc * c_prev, f_t = ScalarSigmoid(f_t_input), tanh_c_part = ScalarTanh(c_part), - c_t = f_t * c_prev + i_t * tanh_c_part, + c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part, o_t_input = o_part + w_oc * c_t, o_t = ScalarSigmoid(o_t_input), tanh_c_t = ScalarTanh(c_t); @@ -645,25 +664,25 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, // comes directly from the output of this function. 
Real dc_t_out = output_deriv_mat(r, c); Real dm_t = output_deriv_mat(r, c + cell_dim); - Real dtanh_c_t = o_t * dm_t; - Real do_t = tanh_c_t * dm_t; + Real dtanh_c_t = o_t * o_scale * dm_t; + Real do_t = o_scale * tanh_c_t * dm_t; Real do_t_input = (o_t * (1.0F - o_t) * do_t - (2.0F * o_t - 1.0F) * o_t_self_repair); Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; - Real dtanh_c_part = i_t * dc_t; - Real df_t = dc_t * c_prev; - Real df_t_input = (df_t * f_t * (1.0F - f_t) - - (2.0F * f_t - 1.0F) * f_t_self_repair); - Real di_t = dc_t * tanh_c_part; - Real di_t_input = (di_t * i_t * (1.0F - i_t) - - (2.0F * i_t - 1.0F) * i_t_self_repair); + Real dtanh_c_part = i_t * i_scale * dc_t; + Real df_t = dc_t * f_scale * c_prev; + Real df_t_input = ((df_t * f_t * (1.0F - f_t) + - (2.0F * f_t - 1.0F) * f_t_self_repair)); + Real di_t = dc_t * i_scale * tanh_c_part; + Real di_t_input = ((di_t * i_t * (1.0F - i_t) + - (2.0F * i_t - 1.0F) * i_t_self_repair)); w_ic_deriv_sum += c_prev * di_t_input; w_fc_deriv_sum += c_prev * df_t_input; w_oc_deriv_sum += c_t * do_t_input; - Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t; Real do_part = do_t_input; Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part - tanh_c_part * c_part_self_repair); @@ -724,10 +743,11 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, CuMatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + cell_dim = input.NumCols() / 5, + input_cols = input.NumCols(); // Check dimensions. - KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -762,6 +782,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK + int have_dropout_mask = (input_cols == (cell_dim * 5) + 3); // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. 
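Because the dropout scales are held fixed during backprop, they simply multiply the corresponding chain-rule terms (the dtanh_c_t, df_t, di_t and dc_prev lines above). A self-contained numerical check of one such term, d c_t / d c_part = i_t * i_scale * (1 - tanh^2(c_part)) — with made-up scalar values and no Kaldi dependencies, purely for illustration — is:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // Made-up inputs for a single cell dimension; f_scale = 0 mimics the forget
  // gate being dropped out for this frame.
  double i_part = 0.3, f_part = -0.2, c_part = 0.5, c_prev = -0.7;
  double w_ic = 0.05, w_fc = -0.1;
  double i_scale = 1.0, f_scale = 0.0;
  auto sigmoid = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
  // c_t as a function of c_part only, matching the forward equations above.
  auto c_t_of = [&](double cp) {
    double i_t = sigmoid(i_part + w_ic * c_prev);
    double f_t = sigmoid(f_part + w_fc * c_prev);
    return f_t * f_scale * c_prev + i_t * i_scale * std::tanh(cp);
  };
  double delta = 1e-6;
  double numeric = (c_t_of(c_part + delta) - c_t_of(c_part - delta)) / (2.0 * delta);
  double i_t = sigmoid(i_part + w_ic * c_prev);
  double analytic = i_t * i_scale * (1.0 - std::tanh(c_part) * std::tanh(c_part));
  std::printf("numeric = %.8f, analytic = %.8f\n", numeric, analytic);
  assert(std::fabs(numeric - analytic) < 1e-5);
  return 0;
}

This is the same style of check that UnitTestLstmNonlinearity performs in matrix form, comparing the TraceMatMat prediction from input_deriv against the measured objective change.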
@@ -775,7 +796,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, dim3 dimGrid(n_blocks(cell_dim, dimBlock.x)); if (input_deriv == NULL) { if (params_deriv == NULL) { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -793,7 +815,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, 0); } else { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -811,7 +834,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, } } else { if (params_deriv == NULL) { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -821,7 +845,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, NULL, 0, NULL, 0, NULL, 0, NULL, 0); } else { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index b0e0c2a1ff2..af3da0b47e2 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -88,6 +88,9 @@ void Group2norm(const CuMatrixBase &src, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + This function will also accept input of dimension N by 5C + 3, + and the three final elements will be used as scaling factors + on i_t, f_t and o_t (useful as per-frame dropout masks). @param [in] params A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}. @@ -101,7 +104,6 @@ void Group2norm(const CuMatrixBase &src, o_t = Sigmoid(o_part + w_{oc}*c_t) m_t = o_t * Tanh(c_t) - */ template void ComputeLstmNonlinearity(const CuMatrixBase &input, @@ -134,6 +136,9 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + This function will also accept input of dimension N by 5C + 3, + and the three final elements will be interpreted as scaling factors + on i_t, f_t and o_t (useful as per-frame dropout masks). @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -165,9 +170,13 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should - have the same dimension as 'input' i.e. N by 5C. 
In - addition to the regular backpropagated derivative, the - output will include small values relating to 'self-repair'. + have the same dimension as 'input'. In addition to the + regular backpropagated derivative, the output will include + small values relating to 'self-repair'. If the input + is of column-dimension 5C + 3 (i.e. we are using dropout + masks), the derivatives w.r.t. the dropout masks will not + be set; they will retain their value prior to this + function call. @param [out] params_deriv May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective @@ -196,6 +205,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, processed outside this function into self-repair stats for diagnostics. */ + template void BackpropLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 23a8662a0d5..4a2a8d1c09a 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -147,6 +147,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new ConstantComponent(); } else if (component_type == "DropoutComponent") { ans = new DropoutComponent(); + } else if (component_type == "DropoutMaskComponent") { + ans = new DropoutMaskComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); } else if (component_type == "LstmNonlinearityComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c1732fc9b25..7cf438a025e 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -82,8 +82,11 @@ enum ComponentProperties { // Tanh, Sigmoid, ReLU and Softmax). kInputContiguous = 0x1000, // true if the component requires its input data (and // input derivatives) to have Stride()== NumCols(). - kOutputContiguous = 0x2000 // true if the component requires its input data (and + kOutputContiguous = 0x2000, // true if the component requires its input data (and // output derivatives) to have Stride()== NumCols(). + kRandomComponent = 0x4000 // true if the component has some kind of + // randomness, like DropoutComponent (these should + // inherit from class RandomComponent. 
};
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index 4aa65ce70ed..85743490518 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1376,5 +1376,88 @@ void ConstantComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
+std::string DropoutMaskComponent::Info() const {
+  std::ostringstream stream;
+  stream << Type()
+         << ", output-dim=" << output_dim_
+         << ", dropout-proportion=" << dropout_proportion_;
+  return stream.str();
+}
+
+DropoutMaskComponent::DropoutMaskComponent():
+    output_dim_(-1), dropout_proportion_(0.5) { }
+
+DropoutMaskComponent::DropoutMaskComponent(
+    const DropoutMaskComponent &other):
+    output_dim_(other.output_dim_),
+    dropout_proportion_(other.dropout_proportion_) { }
+
+void DropoutMaskComponent::Propagate(
+    const ComponentPrecomputedIndexes *indexes,
+    const CuMatrixBase<BaseFloat> &in,
+    CuMatrixBase<BaseFloat> *out) const {
+  KALDI_ASSERT(in.NumRows() == 0 && out->NumCols() == output_dim_);
+  BaseFloat dropout_proportion = dropout_proportion_;
+  KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0);
+
+  if (dropout_proportion_ == 0) {
+    out->Set(1.0);
+    return;
+  }
+  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+  out->Add(-dropout_proportion);
+  out->ApplyHeaviside();
+  // To generate data where it's never the case that both of the dimensions
+  // for a row are zero, we generate uniformly distributed data (call this u_i),
+  // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
+  // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
+  int32 num_rows = out->NumRows();
+  // later we may make this a bit more efficient.
+  CuVector<BaseFloat> temp(num_rows, kUndefined);
+  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
+  temp.Add(-dropout_proportion);
+  out->CopyColFromVec(temp, 0);
+  temp.Add(-1.0 + (2.0 * dropout_proportion));
+  // Now, 'temp' contains the original uniformly-distributed data plus
+  // -(1 - dropout_proportion).
+  temp.Scale(-1.0);
+  out->CopyColFromVec(temp, 1);
+  out->ApplyHeaviside();
+}
+
+
+void DropoutMaskComponent::Read(std::istream &is, bool binary) {
+  ExpectOneOrTwoTokens(is, binary, "<DropoutMaskComponent>", "<OutputDim>");
+  ReadBasicType(is, binary, &output_dim_);
+  ExpectToken(is, binary, "<DropoutProportion>");
+  ReadBasicType(is, binary, &dropout_proportion_);
+  ExpectToken(is, binary, "</DropoutMaskComponent>");
+}
+
+
+void DropoutMaskComponent::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<DropoutMaskComponent>");
+  WriteToken(os, binary, "<OutputDim>");
+  WriteBasicType(os, binary, output_dim_);
+  WriteToken(os, binary, "<DropoutProportion>");
+  WriteBasicType(os, binary, dropout_proportion_);
+  WriteToken(os, binary, "</DropoutMaskComponent>");
+}
+
+Component* DropoutMaskComponent::Copy() const {
+  return new DropoutMaskComponent(*this);
+}
+
+void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) {
+  output_dim_ = 0;
+  bool ok = cfl->GetValue("output-dim", &output_dim_);
+  KALDI_ASSERT(ok && output_dim_ > 0);
+  dropout_proportion_ = 0.5;
+  cfl->GetValue("dropout-proportion", &dropout_proportion_);
+}
+
+
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index b945edf4475..d5d7a140177 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -669,6 +669,88 @@ class ConstantComponent: public UpdatableComponent {
+// DropoutMaskComponent outputs a random zero-or-one value for all dimensions of
+// all requested indexes, and it has no dependencies on any input.
It's like a +// ConstantComponent, but with random output that has value zero +// a proportion (dropout_proportion) of the time, and otherwise one. +// This is not the normal way to implement dropout; you'd normally use a +// DropoutComponent (see nnet-simple-component.h). This component is used while +// implementing per-frame dropout with the LstmNonlinearityComponent; we +// generate a two-dimensional output representing dropout +// +class DropoutMaskComponent: public RandomComponent { + public: + // actually this component requires no inputs; this value + // is really a don't-care. + virtual int32 InputDim() const { return output_dim_; } + + virtual int32 OutputDim() const { return output_dim_; } + + virtual std::string Info() const; + + // possible parameter values with their defaults: + // dropout-proportion=0.5 output-dim=-1 + virtual void InitFromConfig(ConfigLine *cfl); + + DropoutMaskComponent(); + + DropoutMaskComponent(const DropoutMaskComponent &other); + + virtual std::string Type() const { return "DropoutMaskComponent"; } + virtual int32 Properties() const { return kRandomComponent; } + // note: the matrix 'in' will be empty. + virtual void Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + // backprop does nothing, there is nothing to backprop to and nothing + // to update. + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const { } + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions that are only to be reimplemented for GeneralComponents. + virtual void GetInputIndexes(const MiscComputationInfo &misc_info, + const Index &output_index, + std::vector *desired_indexes) const { + desired_indexes->clear(); // requires no inputs. + } + + // This function returns true if at least one of the input indexes used to + // compute this output index is computable. + // it's simple because this component requires no inputs. + virtual bool IsComputable(const MiscComputationInfo &misc_info, + const Index &output_index, + const IndexSet &input_index_set, + std::vector *used_inputs) const { + if (used_inputs) used_inputs->clear(); + return true; + } + + void SetDropoutProportion(BaseFloat p) { dropout_proportion_ = p; } + + private: + + // The output dimension + int32 output_dim_; + + BaseFloat dropout_proportion_; + + const DropoutMaskComponent &operator + = (const DropoutMaskComponent &other); // Disallow. +}; + + + + } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 8bbe76840da..91f8f5139b2 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4939,13 +4939,20 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) { if(this_component->Type() == "CompositeComponent") { DeletePointers(&components); delete this_component; + // This is not allowed. If memory is too much with just one + // CompositeComponent, try decreasing max-rows-process instead. KALDI_ERR << "Found CompositeComponent nested within CompositeComponent." - << "Try decreasing max-rows-process instead." 
<< "Nested line: '" << nested_line.WholeLine() << "'\n" << "Toplevel CompositeComponent line '" << cfl->WholeLine() << "'"; } this_component->InitFromConfig(&nested_line); + int32 props = this_component->Properties(); + if ((props & kRandomComponent) != 0 || + (props & kSimpleComponent) == 0) { + KALDI_ERR << "CompositeComponent contains disallowed component type: " + << nested_line.WholeLine(); + } components.push_back(this_component); } if (cfl->HasUnusedValues()) @@ -4965,10 +4972,9 @@ void CompositeComponent::SetComponent(int32 i, Component *component) { components_[i] = component; } - int32 LstmNonlinearityComponent::InputDim() const { int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5; + return cell_dim * 5 + (use_dropout_ ? 3 : 0); } int32 LstmNonlinearityComponent::OutputDim() const { @@ -4990,7 +4996,15 @@ void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); self_repair_total_.Read(is, binary); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &use_dropout_); + ReadToken(is, binary, &tok); + } else { + use_dropout_ = false; + } + KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &count_); // For the on-disk format, we normalze value_sum_, deriv_sum_ and @@ -5037,6 +5051,12 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { self_repair_prob.Scale(1.0 / (count_ * cell_dim)); self_repair_prob.Write(os, binary); } + if (use_dropout_) { + // only write this if true; we have back-compat code in reading anyway. + // this makes the models without dropout easier to read with older code. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); @@ -5047,7 +5067,8 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { std::string LstmNonlinearityComponent::Info() const { std::ostringstream stream; int32 cell_dim = params_.NumCols(); - stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim; + stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim + << ", use-dropout=" << (use_dropout_ ? 
"true" : "false"); PrintParameterStats(stream, "w_ic", params_.Row(0)); PrintParameterStats(stream, "w_fc", params_.Row(1)); PrintParameterStats(stream, "w_oc", params_.Row(2)); @@ -5213,6 +5234,7 @@ LstmNonlinearityComponent::LstmNonlinearityComponent( const LstmNonlinearityComponent &other): UpdatableComponent(other), params_(other.params_), + use_dropout_(other.use_dropout_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), self_repair_config_(other.self_repair_config_), @@ -5221,7 +5243,8 @@ LstmNonlinearityComponent::LstmNonlinearityComponent( preconditioner_(other.preconditioner_) { } void LstmNonlinearityComponent::Init( - int32 cell_dim, BaseFloat param_stddev, + int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, BaseFloat tanh_self_repair_threshold, BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale) { @@ -5231,6 +5254,7 @@ void LstmNonlinearityComponent::Init( sigmoid_self_repair_threshold >= 0.0 && sigmoid_self_repair_threshold <= 0.25 && self_repair_scale >= 0.0 && self_repair_scale <= 0.1); + use_dropout_ = use_dropout; params_.Resize(3, cell_dim); params_.SetRandn(); params_.Scale(param_stddev); @@ -5265,6 +5289,7 @@ void LstmNonlinearityComponent::InitNaturalGradient() { void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { InitLearningRatesFromConfig(cfl); bool ok = true; + bool use_dropout = false; int32 cell_dim; // these self-repair thresholds are the normal defaults for tanh and sigmoid // respectively. If, later on, we decide that we want to support different @@ -5284,6 +5309,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("sigmoid-self-repair-threshold", &sigmoid_self_repair_threshold); cfl->GetValue("self-repair-scale", &self_repair_scale); + cfl->GetValue("use-dropout", &use_dropout); // We may later on want to make it possible to initialize the different // parameters w_ic, w_fc and w_oc with different biases. We'll implement @@ -5293,7 +5319,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (ok) { - Init(cell_dim, param_stddev, tanh_self_repair_threshold, + Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, sigmoid_self_repair_threshold, self_repair_scale); } else { KALDI_ERR << "Invalid initializer for layer of type " diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 62b4c9006d8..60fd1634598 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -99,7 +99,8 @@ class DropoutComponent : public RandomComponent { dropout_per_frame_(false) { } virtual int32 Properties() const { - return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput; + return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput| + kBackpropNeedsOutput|kRandomComponent; } virtual std::string Type() const { return "DropoutComponent"; } @@ -1677,8 +1678,9 @@ class ConvolutionComponent: public UpdatableComponent { // o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o // // The part of the computation that takes place in this component is as follows. -// Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and -// c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t. +// Its input is of dimension 5C [however, search for 'dropout' below], +// consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). 
Its +// output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
 //
 // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
 //
@@ -1696,6 +1698,12 @@ class ConvolutionComponent: public UpdatableComponent {
 // m_t = o_t * Tanh(c_t) (5)
 // # note: the outputs are just c_t and m_t.
 //
+// [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead
+// of 5C; in this case, the last three input dimensions will be interpreted as
+// per-frame dropout masks on i_t, f_t and o_t respectively, so that in (3), i_t is
+// replaced by i_t * i_t_scale (and likewise for f_t), and in (5), o_t by o_t * o_t_scale.]
+//
+//
 // The backprop is as you would think, but for the "self-repair" we need to pass
 // in additional vectors (of the same dim as the parameters of the layer) that
 // dictate whether or not we add an additional term to the backpropagated
 //
@@ -1715,7 +1723,7 @@ class LstmNonlinearityComponent: public UpdatableComponent {
   virtual int32 OutputDim() const;
   virtual std::string Info() const;
   virtual void InitFromConfig(ConfigLine *cfl);
-  LstmNonlinearityComponent() { } // use Init to really initialize.
+  LstmNonlinearityComponent(): use_dropout_(false) { }
   virtual std::string Type() const { return "LstmNonlinearityComponent"; }
   virtual int32 Properties() const {
     return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
@@ -1751,15 +1759,12 @@ class LstmNonlinearityComponent: public UpdatableComponent {
   explicit LstmNonlinearityComponent(
       const LstmNonlinearityComponent &other);
-  void Init(int32 cell_dim, BaseFloat param_stddev,
+  void Init(int32 cell_dim, bool use_dropout,
+            BaseFloat param_stddev,
             BaseFloat tanh_self_repair_threshold,
             BaseFloat sigmoid_self_repair_threshold,
             BaseFloat self_repair_scale);
-  void Init(std::string vector_filename,
-            int32 rank, int32 update_period, BaseFloat num_samples_history,
-            BaseFloat alpha, BaseFloat max_change_per_minibatch);
-
  private:
   // Initializes the natural-gradient object with the configuration we
@@ -1773,6 +1778,10 @@ class LstmNonlinearityComponent: public UpdatableComponent {
   // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
   CuMatrix<BaseFloat> params_;
+  // If true, we expect an extra 3 dimensions on the input, for dropout masks
+  // for i_t, f_t and o_t.
+  bool use_dropout_;
+
   // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
   // equations (1) through (5), this is the sum of the values of the nonlinearities
   // (used for diagnostics only).
It is comparable to value_sum_ vector diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index a7f732a9864..27415fe8775 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -21,6 +21,7 @@ #include "nnet3/nnet-utils.h" #include "nnet3/nnet-graph.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-general-component.h" #include "nnet3/nnet-parse.h" namespace kaldi { @@ -461,6 +462,10 @@ void SetDropoutProportion(BaseFloat dropout_proportion, DropoutComponent *dc = dynamic_cast(comp); if (dc != NULL) dc->SetDropoutProportion(dropout_proportion); + DropoutMaskComponent *mc = + dynamic_cast(nnet->GetComponent(c)); + if (mc != NULL) + mc->SetDropoutProportion(dropout_proportion); } } @@ -629,16 +634,20 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { KALDI_ERR << "In edits-config, expected proportion to be set in line: " << config_line.WholeLine(); } - DropoutComponent *dropout_component = NULL; int32 num_dropout_proportions_set = 0; for (int32 c = 0; c < nnet->NumComponents(); c++) { if (NameMatchesPattern(nnet->GetComponentName(c).c_str(), - name_pattern.c_str()) && - (dropout_component = - dynamic_cast(nnet->GetComponent(c)))) { + name_pattern.c_str())) { + DropoutComponent *dropout_component = + dynamic_cast(nnet->GetComponent(c)); + DropoutMaskComponent *mask_component = + dynamic_cast(nnet->GetComponent(c)); if (dropout_component != NULL) { dropout_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; + } else if (mask_component != NULL){ + mask_component->SetDropoutProportion(proportion); + num_dropout_proportions_set++; } } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 921f1f1901d..041a916fb69 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -161,7 +161,7 @@ void ConvertRepeatedToBlockAffine(Nnet *nnet); std::string NnetInfo(const Nnet &nnet); /// This function sets the dropout proportion in all dropout components to -/// the value 'dropout_proportion' +/// dropout_proportion value. void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); /// This function finds a list of components that are never used, and outputs