From 13e8bed04d7c4b043badc2e8d17513c3e6b144a3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 20:41:35 -0500 Subject: [PATCH 01/21] [src,scripts,egs] nnet3,fast-lstm: changes to support separate per-frame dropout masks on i and f gates. Old dropout method not supported in this branch. --- .../local/chain/tuning/run_tdnn_lstm_1p.sh | 344 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1q.sh | 348 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 50 +-- src/cudamatrix/cu-kernels-ansi.h | 8 +- src/cudamatrix/cu-kernels.cu | 66 ++-- src/cudamatrix/cu-kernels.h | 16 +- src/cudamatrix/cu-math-test.cc | 34 +- src/cudamatrix/cu-math.cc | 76 ++-- src/cudamatrix/cu-math.h | 53 +-- src/nnet3/nnet-component-itf.cc | 2 + src/nnet3/nnet-component-itf.h | 5 +- src/nnet3/nnet-general-component.cc | 100 +++++ src/nnet3/nnet-general-component.h | 89 +++++ src/nnet3/nnet-simple-component.cc | 36 +- src/nnet3/nnet-simple-component.h | 27 +- src/nnet3/nnet-utils.cc | 17 +- src/nnet3/nnet-utils.h | 2 +- 17 files changed, 1138 insertions(+), 135 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh new file mode 100755 index 00000000000..246601d8535 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh @@ -0,0 +1,344 @@ +#!/bin/bash + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1p #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh new file mode 100755 index 00000000000..f6a640fe17f --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh @@ -0,0 +1,348 @@ +#!/bin/bash + +# 1q is as 1p, but add the "dropout-exclusive" option which means that +# never drops out *both* the i and f gates. +# not helpful. see run_tdnn_lstm_1p.sh for results. + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. +# +# 1k is as 1e, but introducing a dropout schedule. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 +# [looped:] 9.0 8.6 8.9 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 +# [looped:] 8.4 7.8 8.2 8.3 +# WER on test(orig) 8.8 8.8 8.9 8.9 +# [looped:] 8.8 8.7 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.2 8.5 +# [looped:] 8.3 8.3 8.3 8.4 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 + + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1q #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-exclusive=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9d7f649c4b4..f6d93808538 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -682,9 +682,12 @@ def set_default_configs(self): 'decay-time': -1.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'dropout-per-frame' : False # If false, regular dropout, not per frame. - } + 'dropout-proportion' : -1.0, # If -1.0, no dropout will + # be used (note: this is + # per-frame dropout on the + # output of the i_t and f_t gates) + 'dropout-exclusive' : False # option affecting dropout masks. + } def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -717,7 +720,6 @@ def check_configs(self): raise RuntimeError("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) - def auxiliary_outputs(self): return ['c_t'] @@ -785,7 +787,7 @@ def generate_lstm_config(self): lstm_str = self.config['lstm-nonlinearity-options'] dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + dropout_exclusive = 'true' if self.config['dropout-exclusive'] else 'false' configs = [] @@ -800,14 +802,16 @@ def generate_lstm_config(self): configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") - configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, lstm_str)) + configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} " + "use-dropout={2} {3}" + .format(name, cell_dim, "true" if dropout_proportion != -1.0 else "false", lstm_str)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) if dropout_proportion != -1.0: - configs.append("component name={0}.cr_trunc.dropout type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim + rec_proj_dim, dropout_proportion, dropout_per_frame)) + configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=2 " + "dropout-proportion={1} exclusive={2}" + .format(name, dropout_proportion, dropout_exclusive)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent 
projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " @@ -816,8 +820,17 @@ def generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) - configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + if dropout_proportion != -1.0: + # note: the 'input' is a don't-care as the component never uses it; it's required + # in component-node lines. + configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " + "input={0}.dropout_mask".format(name)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" + .format(name, delay)) + else: + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " @@ -831,17 +844,10 @@ def generate_lstm_config(self): configs.append("# makes the deriv truncation more accurate .") configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " "input=Append({0}.c, {0}.r)".format(name)) - if dropout_proportion != -1.0: - configs.append("component-node name={0}.cr_trunc.dropout component={0}.cr_trunc.dropout input={0}.cr_trunc".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc.dropout " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc.dropout " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) - else: - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 116428ea82c..291d6a72cf3 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -650,14 +650,17 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out); void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out); void 
cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int in_stride, const double* params, const int params_stride, @@ -677,6 +680,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, double* self_repair_sum_out, const int self_repair_sum_out_stride); void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int in_stride, const float* params, const int params_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index abb4efd47ef..f50e5853fdd 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2722,6 +2722,9 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, consecutive blocks, each of dimension cell_dim, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + If 'have_dropout_mask' is nonzero, each row of + 'in' will have two extra elements, interpreted + as dropout masks/scales for i_t and f_t. @param [in] params A matrix, of dimension 3 by cell_dim, with rows containing the 3 diagonal parameter matrices used in LSTMs, namely @@ -2746,7 +2749,8 @@ __global__ static void _lstm_nonlinearity(const Real* in, const int in_stride, const Real* params, const int params_stride, const int out_stride, const int cell_dim, - const int num_rows, Real* out) { + const int have_dropout_mask, const int num_rows, + Real* out) { const int tid = threadIdx.x; const int i = blockIdx.x; const Real* i_part = in + i * in_stride; @@ -2759,12 +2763,14 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, const Real* w_oc = params + params_stride * 2; Real* c_t = out + i * out_stride; Real* m_t = out + i * out_stride + cell_dim; + Real i_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5] : 1), + f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1); for (int j = tid; j < cell_dim; j += CU1DBLOCK) { Real c_tm1_j = c_tm1[j]; Real i_t_j = Real(1) / (Real(1) + exp(-i_part[j] - w_ic[j] * c_tm1_j)); Real f_t_j = Real(1) / (Real(1) + exp(-f_part[j] - w_fc[j] * c_tm1_j)); - Real c_t_j = f_t_j * c_tm1_j + i_t_j * tanh(c_part[j]); + Real c_t_j = f_t_j * f_scale * c_tm1_j + i_t_j * i_scale * tanh(c_part[j]); Real o_t_j = Real(1) / (Real(1) + exp(-o_part[j] - w_oc[j] * c_t_j)); c_t[j] = c_t_j; m_t[j] = o_t_j * tanh(c_t_j); @@ -2792,6 +2798,9 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + If 'have_dropout_mask' is nonzero, each row of + 'in' will have two extra elements, interpreted + as dropout masks/scales for i_t and f_t. @param [in] params The same as in ComputeLstmNonlinearity(). 
A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -2864,7 +2873,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, */ template __global__ -static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, +static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_mask, + const int num_rows, const Real* input, const int input_stride, const Real* params, const int params_stride, const Real* output_deriv, @@ -2918,6 +2928,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real o_t_self_repair = (update_sr[3] ? sr_config[8] : 0); const Real c_t_self_repair = (update_sr[4] ? sr_config[9] : 0); + for (int i = i0; i < num_rows; i += grid_stride) { const Real i_part = input[i * input_stride + j]; const Real f_part = input[i * input_stride + j + cell_dim]; @@ -2925,10 +2936,15 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real o_part = input[i * input_stride + j + 3 * cell_dim]; const Real c_prev = input[i * input_stride + j + 4 * cell_dim]; - const Real i_t = 1 / (1 + exp(-i_part - w_ic * c_prev)); - const Real f_t = 1 / (1 + exp(-f_part - w_fc * c_prev)); + + const Real i_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5] : 1), + f_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5 + 1] :1); + const Real i_t = Real(1) / (1 + exp(-i_part - w_ic * c_prev)); + const Real f_t = Real(1) / (1 + exp(-f_part - w_fc * c_prev)); const Real tanh_c_part = tanh(c_part); - const Real c_t = f_t * c_prev + i_t * tanh_c_part; + const Real c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part; const Real o_t = 1 / (1 + exp(-o_part - w_oc * c_t)); const Real tanh_c_t = tanh(c_t); @@ -2962,13 +2978,13 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real dc_t = (c_t_deriv * dtanh_c_t + dc_t_out + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; - const Real dtanh_c_part = i_t * dc_t; - const Real df_t = dc_t * c_prev; + const Real dtanh_c_part = i_t * i_scale * dc_t; + const Real df_t = dc_t * f_scale * c_prev; const Real df_t_input = (df_t * f_t_deriv - - (2 * f_t - 1) * f_t_self_repair); - const Real di_t = dc_t * tanh_c_part; + - (2 * f_t - 1) * f_t_self_repair); + const Real di_t = dc_t * i_scale * tanh_c_part; const Real di_t_input = (di_t * i_t_deriv - - (2 * i_t - 1) * i_t_self_repair); + - (2 * i_t - 1) * i_t_self_repair); if (params_deriv) { w_ic_deriv_sum += c_prev * di_t_input; @@ -2976,7 +2992,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, w_oc_deriv_sum += c_t * do_t_input; } - const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t; const Real do_part = do_t_input; const Real dc_part = (c_part_deriv * dtanh_c_part - tanh_c_part * c_part_self_repair); @@ -4591,20 +4607,23 @@ void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, - double* out) { - _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out) { + _lstm_nonlinearity<<>>( + in, in_stride, params, 
params_stride, + out_stride, cell_dim, have_dropout_mask, num_rows, out); } void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, - float* out) { - _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out) { + _lstm_nonlinearity<<>>( + in, in_stride, params, params_stride, + out_stride, cell_dim, have_dropout_mask, num_rows, out); } void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, const int params_stride, @@ -4623,7 +4642,8 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + _diff_lstm_nonlinearity<<>>( + cell_dim, have_dropout_mask, num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, @@ -4631,6 +4651,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out, self_repair_sum_out_stride); } void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, const int params_stride, @@ -4649,7 +4670,8 @@ void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + _diff_lstm_nonlinearity<<>>( + cell_dim, have_dropout_mask, num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 649a25ab67e..0e578ee7b49 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1258,19 +1258,24 @@ inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, double* out) { cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, float* out) { cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, @@ -1290,7 +1295,8 @@ inline void 
cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, + input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -1301,6 +1307,7 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out_stride); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, @@ -1320,7 +1327,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, + num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index abd93fb1a0a..9abb6c7e8d1 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -144,7 +144,8 @@ static void UnitTestCuMathComputeLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 100; int32 cell_dim = 1 + Rand() % 2000; - Matrix Hinput(num_rows, 5 * cell_dim); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + Matrix Hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix Hparams(3, cell_dim); Matrix Houtput(num_rows, 2 * cell_dim); Hinput.SetRandn(); @@ -165,7 +166,8 @@ static void UnitTestCuMathComputeLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - CuMatrix input(num_rows, 5 * cell_dim); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output(num_rows, 2 * cell_dim); input.SetRandn(); @@ -190,7 +192,8 @@ void UnitTestLstmNonlinearity() { // problem dimensions. int32 num_rows = RandInt(5, 20), - cell_dim = RandInt(2, 200); + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -207,7 +210,7 @@ void UnitTestLstmNonlinearity() { test_params = -1; - CuMatrix input(num_rows, cell_dim * 5), + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); input.SetRandn(); @@ -230,7 +233,7 @@ void UnitTestLstmNonlinearity() { CuVector self_repair_config(10.0); // leave at zero... we don't really test this here. 
CuMatrix self_repair_sum(5, cell_dim), - input_deriv(num_rows, 5 * cell_dim), + input_deriv(num_rows, 5 * cell_dim + dropout_dim), params_deriv(3, cell_dim); double count_in = 0.0; @@ -249,7 +252,7 @@ void UnitTestLstmNonlinearity() { measured_objf_change(test_dim); for (int32 i = 0; i < test_dim; i++) { - CuMatrix delta_input(num_rows, 5 * cell_dim), + CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); @@ -260,12 +263,9 @@ void UnitTestLstmNonlinearity() { delta_params.Scale(delta); } - - predicted_objf_change(i) = TraceMatMat(delta_input, input_deriv, kTrans) + TraceMatMat(delta_params, params_deriv, kTrans); - CuMatrix perturbed_input(input); perturbed_input.AddMat(1.0, delta_input); @@ -280,7 +280,9 @@ void UnitTestLstmNonlinearity() { measured_objf_change(i) = objf_change; } KALDI_LOG << "LSTM nonlinearity test: num_rows=" << num_rows - << ", cell_dim=" << cell_dim << ", test_input=" << test_input + << ", cell_dim=" << cell_dim + << ", dropout_dim=" << dropout_dim + << ", test_input=" << test_input << ", test_params=" << test_params << ", test_output=" << test_output << ", predicted_objf_change=" << predicted_objf_change @@ -296,16 +298,17 @@ template static void UnitTestBackpropLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 200; - int32 cell_dim = 1 + Rand() % 2000; + int32 cell_dim = 1 + Rand() % 2000, + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); // KALDI_LOG << num_rows << ", " << cell_dim; - Matrix hinput(num_rows, 5 * cell_dim); + Matrix hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix hparams(3, cell_dim); Matrix houtput_deriv(num_rows, 2 * cell_dim); Matrix hderiv_sum_in(5, cell_dim); Vector hself_repair_config(10); double count_in; - Matrix hinput_deriv(num_rows, 5 * cell_dim); + Matrix hinput_deriv(num_rows, 5 * cell_dim + dropout_dim); Matrix hparams_deriv(3, cell_dim); Matrix hvalue_sum_out(5, cell_dim); Matrix hderiv_sum_out(5, cell_dim); @@ -409,15 +412,16 @@ static void UnitTestBackpropLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; + int32 dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 2); - CuMatrix input(num_rows, 5 * cell_dim); + CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output_deriv(num_rows, 2 * cell_dim); CuMatrix deriv_sum_in(5, cell_dim); CuVector self_repair_config(10); double count_in; - CuMatrix input_deriv(num_rows, 5 * cell_dim); + CuMatrix input_deriv(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params_deriv(3, cell_dim); CuMatrix value_sum_out(5, cell_dim); CuMatrix deriv_sum_out(5, cell_dim); diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index bb55302313a..b76721fcce3 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -317,10 +317,11 @@ template void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, const MatrixBase ¶ms_mat, MatrixBase *output) { - int32 num_rows = input_mat.NumRows(); - int32 cell_dim = input_mat.NumCols() / 5; + int32 num_rows = input_mat.NumRows(), + input_cols = input_mat.NumCols(), + cell_dim = input_cols / 5; + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); KALDI_ASSERT(output->NumRows() == num_rows); - KALDI_ASSERT(input_mat.NumCols() % 5 == 0); KALDI_ASSERT(params_mat.NumRows() == 3); KALDI_ASSERT(params_mat.NumCols() == cell_dim); KALDI_ASSERT(output->NumCols() == 2 * cell_dim); @@ -330,6 +331,10 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, int32 params_stride = params_mat.Stride(); for (int32 r = 0; r < num_rows; r++) { const Real *input_row = input_mat.RowData(r); + // i_scale and f_scale relate to dropout, they will normally be 1.0. + Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]), + f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]); + Real *output_row = output_mat.RowData(r); for (int32 c = 0; c < cell_dim; c++) { Real i_part = input_row[c]; @@ -342,7 +347,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, Real w_oc = params_data[c + params_stride * 2]; Real i_t = ScalarSigmoid(i_part + w_ic * c_prev); Real f_t = ScalarSigmoid(f_part + w_fc * c_prev); - Real c_t = f_t * c_prev + i_t * ScalarTanh(c_part); + Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part); Real o_t = ScalarSigmoid(o_part + w_oc * c_t); Real m_t = o_t * ScalarTanh(c_t); output_row[c] = c_t; @@ -355,10 +360,11 @@ template void ComputeLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, CuMatrixBase *output) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + input_cols = input.NumCols(), + cell_dim = input_cols / 5; + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); KALDI_ASSERT(output->NumRows() == num_rows); - KALDI_ASSERT(input.NumCols() % 5 == 0); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output->NumCols() == 2 * cell_dim); @@ -367,6 +373,8 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, if (CuDevice::Instantiate().Enabled()) { Timer tim; + int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); + // Each thread block is working on 1 row of the data. 
// It's best that cell dim is a multiple fo CU1DBLOCK dim3 dimBlock(CU1DBLOCK); @@ -374,7 +382,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(), params.Data(), params.Stride(), output->Stride(), - cell_dim, num_rows, output->Data()); + cell_dim, have_dropout_mask, num_rows, output->Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -414,10 +422,12 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, MatrixBase *value_sum_out, MatrixBase *deriv_sum_out, MatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + input_cols = input + .NumCols(), + cell_dim = input.NumCols() / 5; // Check dimensions. - KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -512,6 +522,12 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, c_part = input_mat(r, c + 2 * cell_dim), o_part = input_mat(r, c + 3 * cell_dim), c_prev = input_mat(r, c + 4 * cell_dim); + + Real i_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5)), + f_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5 + 1)); + // For greater clarity, we give some of the quantities in the // forward equations their own names. Real i_t_input = i_part + w_ic * c_prev, @@ -519,7 +535,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, f_t_input = f_part + w_fc * c_prev, f_t = ScalarSigmoid(f_t_input), tanh_c_part = ScalarTanh(c_part), - c_t = f_t * c_prev + i_t * tanh_c_part, + c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part, o_t_input = o_part + w_oc * c_t, o_t = ScalarSigmoid(o_t_input), tanh_c_t = ScalarTanh(c_t); @@ -557,19 +573,19 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, - (2.0F * o_t - 1.0F) * o_t_self_repair); Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; - Real dtanh_c_part = i_t * dc_t; - Real df_t = dc_t * c_prev; - Real df_t_input = (df_t * f_t * (1.0F - f_t) - - (2.0F * f_t - 1.0F) * f_t_self_repair); - Real di_t = dc_t * tanh_c_part; - Real di_t_input = (di_t * i_t * (1.0F - i_t) - - (2.0F * i_t - 1.0F) * i_t_self_repair); + Real dtanh_c_part = i_t * i_scale * dc_t; + Real df_t = dc_t * f_scale * c_prev; + Real df_t_input = ((df_t * f_t * (1.0F - f_t) + - (2.0F * f_t - 1.0F) * f_t_self_repair)); + Real di_t = dc_t * i_scale * tanh_c_part; + Real di_t_input = ((di_t * i_t * (1.0F - i_t) + - (2.0F * i_t - 1.0F) * i_t_self_repair)); w_ic_deriv_sum += c_prev * di_t_input; w_fc_deriv_sum += c_prev * df_t_input; w_oc_deriv_sum += c_t * do_t_input; - Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t; Real do_part = do_t_input; Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part - tanh_c_part * c_part_self_repair); @@ -630,10 +646,11 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, CuMatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + cell_dim = input.NumCols() / 5, + input_cols = 
input.NumCols(); // Check dimensions. - KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 2); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -668,6 +685,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK + int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. @@ -681,7 +699,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, dim3 dimGrid(n_blocks(cell_dim, dimBlock.x)); if (input_deriv == NULL) { if (params_deriv == NULL) { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -699,7 +718,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, 0); } else { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -717,7 +737,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, } } else { if (params_deriv == NULL) { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -727,7 +748,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, NULL, 0, NULL, 0, NULL, 0, NULL, 0); } else { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 9952ca5b9d2..3313baaa9d1 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -88,6 +88,9 @@ void Group2norm(const CuMatrixBase &src, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + This function will also accept input of dimension N by 5C+2, + and the two final elements will be used as scaling factors + on i_t and f_t (useful as per-frame dropout masks). @param [in] params A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}. @@ -101,7 +104,6 @@ void Group2norm(const CuMatrixBase &src, o_t = Sigmoid(o_part + w_{oc}*c_t) m_t = o_t * Tanh(c_t) - */ template void ComputeLstmNonlinearity(const CuMatrixBase &input, @@ -134,6 +136,9 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). 
+ This function will also accept input of dimension N by 5C+2, + and the two final elements will be interpreted as scaling factors + on i_t and f_t (useful as per-frame dropout masks). @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -165,9 +170,13 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should - have the same dimension as 'input' i.e. N by 5C. In - addition to the regular backpropagated derivative, the - output will include small values relating to 'self-repair'. + have the same dimension as 'input'. In addition to the + regular backpropagated derivative, the output will include + small values relating to 'self-repair'. If the input + is of column-dimension 5C + 2 (i.e. we are using dropout + masks), the derivatives w.r.t. the dropout masks will not + be set; they will retain their value prior to this + function call. @param [out] params_deriv May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective @@ -196,23 +205,6 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, processed outside this function into self-repair stats for diagnostics. */ -/// Normalize nonlinearity modifies the vector of activations -/// by scaling it so that the root-mean-square equals 1.0. -/// -/// The output y_i = scale * x_i, -/// and we want to RMS value of the y_i to equal target_rms, -/// so y^t y = D * target_rms^2 (if y is one row of the input). -/// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). -/// there is also flooring involved, to avoid division-by-zero -/// problems. It's important for the backprop, that the floor's -/// square root is exactly representable as float. -/// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) -/// is an extra dimension of the output. -template -void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, - const bool add_log_stddev, CuMatrixBase* out); - - template void BackpropLstmNonlinearity(const CuMatrixBase &input, @@ -241,6 +233,25 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, MatrixBase *deriv_sum_out, MatrixBase *self_repair_sum_out); + +/// Normalize nonlinearity modifies the vector of activations +/// by scaling it so that the root-mean-square equals 1.0. +/// +/// The output y_i = scale * x_i, +/// and we want to RMS value of the y_i to equal target_rms, +/// so y^t y = D * target_rms^2 (if y is one row of the input). +/// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +/// there is also flooring involved, to avoid division-by-zero +/// problems. It's important for the backprop, that the floor's +/// square root is exactly representable as float. +/// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +/// is an extra dimension of the output. 
+template +void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, + const bool add_log_stddev, CuMatrixBase* out); + + + } // namespace cu } // namespace kaldi diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 23a8662a0d5..4a2a8d1c09a 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -147,6 +147,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new ConstantComponent(); } else if (component_type == "DropoutComponent") { ans = new DropoutComponent(); + } else if (component_type == "DropoutMaskComponent") { + ans = new DropoutMaskComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); } else if (component_type == "LstmNonlinearityComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c1732fc9b25..7cf438a025e 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -82,8 +82,11 @@ enum ComponentProperties { // Tanh, Sigmoid, ReLU and Softmax). kInputContiguous = 0x1000, // true if the component requires its input data (and // input derivatives) to have Stride()== NumCols(). - kOutputContiguous = 0x2000 // true if the component requires its input data (and + kOutputContiguous = 0x2000, // true if the component requires its input data (and // output derivatives) to have Stride()== NumCols(). + kRandomComponent = 0x4000 // true if the component has some kind of + // randomness, like DropoutComponent (these should + // inherit from class RandomComponent. }; diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 926ebd9b07d..6ff68525d55 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1374,5 +1374,105 @@ void ConstantComponent::UnVectorize(const VectorBase ¶ms) { +std::string DropoutMaskComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", output-dim=" << output_dim_ + << ", dropout-proportion=" << dropout_proportion_ + << ", exclusive=" << (exclusive_ ? 
"true" : "false"); + return stream.str(); +} + +DropoutMaskComponent::DropoutMaskComponent(): + output_dim_(-1), dropout_proportion_(0.5), + exclusive_(false) { } + +DropoutMaskComponent::DropoutMaskComponent( + const DropoutMaskComponent &other): + output_dim_(other.output_dim_), + dropout_proportion_(other.dropout_proportion_), + exclusive_(other.exclusive_) { } + +void DropoutMaskComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumRows() == 0 && out->NumCols() == output_dim_); + BaseFloat dropout_proportion = dropout_proportion_; + KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); + + if (dropout_proportion_ == 0) { + out->Set(1.0); + return; + } + if (!exclusive_) { + const_cast&>(random_generator_).RandUniform(out); + out->Add(-dropout_proportion); + out->ApplyHeaviside(); + } else { + if (!(output_dim_ == 2 && dropout_proportion <= 0.5)) { + KALDI_ERR << "If exclusive=true is set, output-dim must equal 2 (got: " + << output_dim_ << " and dropout-proportion must <= 0.5 (got: " + << dropout_proportion; + } + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. + CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). + temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } +} + + +void DropoutMaskComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &output_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dropout_proportion_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &exclusive_); + ExpectToken(is, binary, ""); +} + + +void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dropout_proportion_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, exclusive_); + WriteToken(os, binary, ""); +} + +Component* DropoutMaskComponent::Copy() const { + return new DropoutMaskComponent(*this); +} + +void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { + output_dim_ = 0; + bool ok = cfl->GetValue("output-dim", &output_dim_); + KALDI_ASSERT(ok && output_dim_ > 0); + dropout_proportion_ = 0.5; + cfl->GetValue("dropout-proportion", &dropout_proportion_); + exclusive_ = false; + cfl->GetValue("exclusive", &exclusive_); +} + + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index b945edf4475..d3de9f40548 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -669,6 +669,95 @@ class ConstantComponent: public UpdatableComponent { +// DropoutMaskComponent outputs a random zero-or-one value for all dimensions of +// all requested indexes, and it has no dependencies on any input. 
It's like a +// ConstantComponent, but with random output that has value zero +// a proportion (dropout_proportion) of the time, and otherwise one. +// This is not the normal way to implement dropout; you'd normally use a +// DropoutComponent (see nnet-simple-component.h). This component is used while +// implementing per-frame dropout with the LstmNonlinearityComponent; we +// generate a two-dimensional output representing dropout +// +class DropoutMaskComponent: public RandomComponent { + public: + // actually this component requires no inputs; this value + // is really a don't-care. + virtual int32 InputDim() const { return output_dim_; } + + virtual int32 OutputDim() const { return output_dim_; } + + virtual std::string Info() const; + + // possible parameter values with their defaults: + // dropout-proportion=0.5 output-dim=-1 exclusive=false + // [for the meaning of 'exclusive', see its declaration]. + virtual void InitFromConfig(ConfigLine *cfl); + + DropoutMaskComponent(); + + DropoutMaskComponent(const DropoutMaskComponent &other); + + virtual std::string Type() const { return "DropoutMaskComponent"; } + virtual int32 Properties() const { return kRandomComponent; } + // note: the matrix 'in' will be empty. + virtual void Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + // backprop does nothing, there is nothing to backprop to and nothing + // to update. + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const { } + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions that are only to be reimplemented for GeneralComponents. + virtual void GetInputIndexes(const MiscComputationInfo &misc_info, + const Index &output_index, + std::vector *desired_indexes) const { + desired_indexes->clear(); // requires no inputs. + } + + // This function returns true if at least one of the input indexes used to + // compute this output index is computable. + // it's simple because this component requires no inputs. + virtual bool IsComputable(const MiscComputationInfo &misc_info, + const Index &output_index, + const IndexSet &input_index_set, + std::vector *used_inputs) const { + if (used_inputs) used_inputs->clear(); + return true; + } + + void SetDropoutProportion(BaseFloat p) { dropout_proportion_ = p; } + + private: + + // The output dimension + int32 output_dim_; + + BaseFloat dropout_proportion_; + + // If true, and only in the special case where output_dim_ == 2, this + // component will make sure that it's never the case that both columns of a + // row of the output are zero. Note: if this is true, you cannot set + // dropout_proportion_ > 0.5. + bool exclusive_; + + const DropoutMaskComponent &operator + = (const DropoutMaskComponent &other); // Disallow. +}; + + + + } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 527b0d54c01..1cde6b3b0fa 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4969,13 +4969,20 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) { if(this_component->Type() == "CompositeComponent") { DeletePointers(&components); delete this_component; + // This is not allowed. 
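Stepping back to DropoutMaskComponent::Propagate() shown above: the 'exclusive' branch derives both mask columns from a single uniform draw u per row, zeroing column 0 when u <= p and column 1 when u >= 1 - p, so for p <= 0.5 each column is zero with probability p but the two columns are never both zero. A small standalone sketch (hypothetical names, not the Kaldi API):

#include <cstdio>
#include <random>

int main() {
  double p = 0.3;  // dropout proportion; must be <= 0.5 in exclusive mode
  std::mt19937 rng(0);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  for (int row = 0; row < 5; row++) {
    double u = uniform(rng);
    // Heaviside(x) = 1 if x > 0 else 0, as in ApplyHeaviside().
    int col0 = (u - p > 0.0) ? 1 : 0;          // mask on the i gate
    int col1 = ((1.0 - p) - u > 0.0) ? 1 : 0;  // mask on the f gate
    std::printf("u=%.3f -> mask=(%d, %d)\n", u, col0, col1);
  }
  return 0;
}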
If memory is too much with just one + // CompositeComponent, try decreasing max-rows-process instead. KALDI_ERR << "Found CompositeComponent nested within CompositeComponent." - << "Try decreasing max-rows-process instead." << "Nested line: '" << nested_line.WholeLine() << "'\n" << "Toplevel CompositeComponent line '" << cfl->WholeLine() << "'"; } this_component->InitFromConfig(&nested_line); + int32 props = this_component->Properties(); + if ((props & kRandomComponent) != 0 || + (props & kSimpleComponent) == 0) { + KALDI_ERR << "CompositeComponent contains disallowed component type: " + << nested_line.WholeLine(); + } components.push_back(this_component); } if (cfl->HasUnusedValues()) @@ -4995,10 +5002,9 @@ void CompositeComponent::SetComponent(int32 i, Component *component) { components_[i] = component; } - int32 LstmNonlinearityComponent::InputDim() const { int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5; + return cell_dim * 5 + (use_dropout_ ? 2 : 0); } int32 LstmNonlinearityComponent::OutputDim() const { @@ -5020,7 +5026,15 @@ void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); self_repair_total_.Read(is, binary); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &use_dropout_); + ReadToken(is, binary, &tok); + } else { + use_dropout_ = false; + } + KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &count_); // For the on-disk format, we normalze value_sum_, deriv_sum_ and @@ -5067,6 +5081,8 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { self_repair_prob.Scale(1.0 / (count_ * cell_dim)); self_repair_prob.Write(os, binary); } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); @@ -5077,7 +5093,8 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { std::string LstmNonlinearityComponent::Info() const { std::ostringstream stream; int32 cell_dim = params_.NumCols(); - stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim; + stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim + << ", use-dropout=" << (use_dropout_ ? 
"true" : "false"); PrintParameterStats(stream, "w_ic", params_.Row(0)); PrintParameterStats(stream, "w_fc", params_.Row(1)); PrintParameterStats(stream, "w_oc", params_.Row(2)); @@ -5243,6 +5260,7 @@ LstmNonlinearityComponent::LstmNonlinearityComponent( const LstmNonlinearityComponent &other): UpdatableComponent(other), params_(other.params_), + use_dropout_(other.use_dropout_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), self_repair_config_(other.self_repair_config_), @@ -5251,7 +5269,8 @@ LstmNonlinearityComponent::LstmNonlinearityComponent( preconditioner_(other.preconditioner_) { } void LstmNonlinearityComponent::Init( - int32 cell_dim, BaseFloat param_stddev, + int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, BaseFloat tanh_self_repair_threshold, BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale) { @@ -5261,6 +5280,7 @@ void LstmNonlinearityComponent::Init( sigmoid_self_repair_threshold >= 0.0 && sigmoid_self_repair_threshold <= 0.25 && self_repair_scale >= 0.0 && self_repair_scale <= 0.1); + use_dropout_ = use_dropout; params_.Resize(3, cell_dim); params_.SetRandn(); params_.Scale(param_stddev); @@ -5295,6 +5315,7 @@ void LstmNonlinearityComponent::InitNaturalGradient() { void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { InitLearningRatesFromConfig(cfl); bool ok = true; + bool use_dropout = false; int32 cell_dim; // these self-repair thresholds are the normal defaults for tanh and sigmoid // respectively. If, later on, we decide that we want to support different @@ -5314,6 +5335,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("sigmoid-self-repair-threshold", &sigmoid_self_repair_threshold); cfl->GetValue("self-repair-scale", &self_repair_scale); + cfl->GetValue("use-dropout", &use_dropout); // We may later on want to make it possible to initialize the different // parameters w_ic, w_fc and w_oc with different biases. We'll implement @@ -5323,7 +5345,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (ok) { - Init(cell_dim, param_stddev, tanh_self_repair_threshold, + Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, sigmoid_self_repair_threshold, self_repair_scale); } else { KALDI_ERR << "Invalid initializer for layer of type " diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 62b4c9006d8..ea5df928b37 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -99,7 +99,8 @@ class DropoutComponent : public RandomComponent { dropout_per_frame_(false) { } virtual int32 Properties() const { - return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput; + return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput| + kBackpropNeedsOutput|kRandomComponent; } virtual std::string Type() const { return "DropoutComponent"; } @@ -1677,8 +1678,9 @@ class ConvolutionComponent: public UpdatableComponent { // o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o // // The part of the computation that takes place in this component is as follows. -// Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and -// c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t. +// Its input is of dimension 5C [however, search for 'dropout' below], +// consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). 
Its +// output is of dimension 2C, consisting of 2 blocks: c_t and m_t. // // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t). // @@ -1696,6 +1698,12 @@ class ConvolutionComponent: public UpdatableComponent { // m_t = o_t * Tanh(c_t) (5) // # note: the outputs are just c_t and m_t. // +// [Note regarding dropout: optionally the input-dimension may be 5C + 2 instead +// of 5C in this case, the last two input dimensions will be interpreted as +// per-frame dropout masks on i_t and f_t respectively, so that in (3), i_t is +// replaced by i_t * i_t_scale, and likewise for f_t. +// +// // The backprop is as you would think, but for the "self-repair" we need to pass // in additional vectors (of the same dim as the parameters of the layer) that // dictate whether or not we add an additional term to the backpropagated @@ -1715,7 +1723,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { virtual int32 OutputDim() const; virtual std::string Info() const; virtual void InitFromConfig(ConfigLine *cfl); - LstmNonlinearityComponent() { } // use Init to really initialize. + LstmNonlinearityComponent(): use_dropout_(false) { } virtual std::string Type() const { return "LstmNonlinearityComponent"; } virtual int32 Properties() const { return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; @@ -1751,15 +1759,12 @@ class LstmNonlinearityComponent: public UpdatableComponent { explicit LstmNonlinearityComponent( const LstmNonlinearityComponent &other); - void Init(int32 cell_dim, BaseFloat param_stddev, + void Init(int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, BaseFloat tanh_self_repair_threshold, BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale); - void Init(std::string vector_filename, - int32 rank, int32 update_period, BaseFloat num_samples_history, - BaseFloat alpha, BaseFloat max_change_per_minibatch); - private: // Initializes the natural-gradient object with the configuration we @@ -1773,6 +1778,10 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; + // If true, we expect an extra 2 dimensions on the input, for dropout masks + // for i_t and f_t. + bool use_dropout_; + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in // equations (1) through (5), this is the sum of the values of the nonliearities // (used for diagnostics only). 
It is comparable to value_sum_ vector diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index a7f732a9864..27415fe8775 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -21,6 +21,7 @@ #include "nnet3/nnet-utils.h" #include "nnet3/nnet-graph.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-general-component.h" #include "nnet3/nnet-parse.h" namespace kaldi { @@ -461,6 +462,10 @@ void SetDropoutProportion(BaseFloat dropout_proportion, DropoutComponent *dc = dynamic_cast(comp); if (dc != NULL) dc->SetDropoutProportion(dropout_proportion); + DropoutMaskComponent *mc = + dynamic_cast(nnet->GetComponent(c)); + if (mc != NULL) + mc->SetDropoutProportion(dropout_proportion); } } @@ -629,16 +634,20 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { KALDI_ERR << "In edits-config, expected proportion to be set in line: " << config_line.WholeLine(); } - DropoutComponent *dropout_component = NULL; int32 num_dropout_proportions_set = 0; for (int32 c = 0; c < nnet->NumComponents(); c++) { if (NameMatchesPattern(nnet->GetComponentName(c).c_str(), - name_pattern.c_str()) && - (dropout_component = - dynamic_cast(nnet->GetComponent(c)))) { + name_pattern.c_str())) { + DropoutComponent *dropout_component = + dynamic_cast(nnet->GetComponent(c)); + DropoutMaskComponent *mask_component = + dynamic_cast(nnet->GetComponent(c)); if (dropout_component != NULL) { dropout_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; + } else if (mask_component != NULL){ + mask_component->SetDropoutProportion(proportion); + num_dropout_proportions_set++; } } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 3bda01271d2..0ed5aa0d5c5 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -160,7 +160,7 @@ void ConvertRepeatedToBlockAffine(Nnet *nnet); /// Info() function (we need this in the CTC code). std::string NnetInfo(const Nnet &nnet); -/// This function sets the dropout proportion in all dropout component to +/// This function sets the dropout proportion in all dropout components to /// dropout_proportion value. 
void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); From 863534b0ec80ef33088183a2509965782e44e5e6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 23:11:16 -0500 Subject: [PATCH 02/21] [egs] Small fixes/additions in Swbd/s5c chain scripts --- .../local/chain/tuning/run_tdnn_lstm_1e.sh | 4 +- .../local/chain/tuning/run_tdnn_lstm_1g.sh | 261 ++++++++++++++++++ 2 files changed, 263 insertions(+), 2 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh index bf93b156974..14dbb1cdd2e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -242,11 +242,11 @@ if [ $stage -le 16 ]; then --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ - $dir/decode_${decode_set}_sw1_tg || exit 1; + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; if $has_fisher; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ - $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; fi ) & done diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..6cacdf2dadb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# 1g is like 1e, but reducing decay-time from 20 to 15, to see if +# it reduces the difference between regular and looped decoding. +# +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=15" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; From eb0f45819a33c79d56b99302d94d276c5921258d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 31 Jan 2017 14:13:59 -0500 Subject: [PATCH 03/21] [src,egs,scripts] Modifying dropout in LSTM to be on (i,f,o) gates not just (i,f); test on tedlium. --- .../local/chain/tuning/run_tdnn_lstm_1p.sh | 21 +++++++++++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 6 ++--- src/cudamatrix/cu-kernels.cu | 23 ++++++++++------- src/cudamatrix/cu-math-test.cc | 10 ++++---- src/cudamatrix/cu-math.cc | 25 +++++++++++-------- src/cudamatrix/cu-math.h | 14 +++++------ src/nnet3/nnet-simple-component.cc | 10 +++++--- src/nnet3/nnet-simple-component.h | 6 ++--- 8 files changed, 73 insertions(+), 42 deletions(-) diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh index 246601d8535..f06f4a7f6ec 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh @@ -4,6 +4,25 @@ # did it in the non-fast LSTMs, with separate per-frame masks on # the i and f component. Using dropout schedule that maxes out at # 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + # # # local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi @@ -95,7 +114,7 @@ frames_per_chunk_primary=140 # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1p #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1p2 #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. # End configuration section. 
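The script and kernel changes below extend the per-frame dropout mask from two elements (i, f) to three (i, f, o). A minimal standalone sketch (illustrative names, not Kaldi code) of the resulting forward computation for one cell: c_t keeps the same form as before, and only the output m_t picks up the new o_scale factor.

#include <cmath>
#include <cstdio>

static double Sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// One cell of the nonlinearity; the scales come from a 3-element per-frame mask.
void LstmCell(double i_part, double f_part, double c_part, double o_part,
              double c_prev, double w_ic, double w_fc, double w_oc,
              double i_scale, double f_scale, double o_scale,
              double *c_t, double *m_t) {
  double i_t = Sigmoid(i_part + w_ic * c_prev);
  double f_t = Sigmoid(f_part + w_fc * c_prev);
  *c_t = f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
  double o_t = Sigmoid(o_part + w_oc * *c_t);
  *m_t = o_t * o_scale * std::tanh(*c_t);  // o_scale is the new factor
}

int main() {
  double c_t, m_t;
  // o gate dropped out for this frame: o_scale = 0 zeroes the output m_t
  // but leaves the cell state c_t untouched.
  LstmCell(0.1, 0.2, 0.3, 0.4, 0.5, 0.1, 0.1, 0.1, 1.0, 1.0, 0.0, &c_t, &m_t);
  std::printf("c_t=%g m_t=%g\n", c_t, m_t);
  return 0;
}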
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index f6d93808538..ac2deb7ecd6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -809,9 +809,9 @@ def generate_lstm_config(self): configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) if dropout_proportion != -1.0: - configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=2 " - "dropout-proportion={1} exclusive={2}" - .format(name, dropout_proportion, dropout_exclusive)) + configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=3 " + "dropout-proportion={1} " + .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index f50e5853fdd..d9d463d1aca 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2723,8 +2723,8 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). If 'have_dropout_mask' is nonzero, each row of - 'in' will have two extra elements, interpreted - as dropout masks/scales for i_t and f_t. + 'in' will have 3 extra elements, interpreted + as dropout masks/scales for i_t, f_t and o_t. @param [in] params A matrix, of dimension 3 by cell_dim, with rows containing the 3 diagonal parameter matrices used in LSTMs, namely @@ -2764,7 +2764,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, Real* c_t = out + i * out_stride; Real* m_t = out + i * out_stride + cell_dim; Real i_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5] : 1), - f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1); + f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1), + o_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 2] : 1); for (int j = tid; j < cell_dim; j += CU1DBLOCK) { Real c_tm1_j = c_tm1[j]; @@ -2773,7 +2774,7 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, Real c_t_j = f_t_j * f_scale * c_tm1_j + i_t_j * i_scale * tanh(c_part[j]); Real o_t_j = Real(1) / (Real(1) + exp(-o_part[j] - w_oc[j] * c_t_j)); c_t[j] = c_t_j; - m_t[j] = o_t_j * tanh(c_t_j); + m_t[j] = o_t_j * o_scale * tanh(c_t_j); } } @@ -2799,8 +2800,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). If 'have_dropout_mask' is nonzero, each row of - 'in' will have two extra elements, interpreted - as dropout masks/scales for i_t and f_t. + 'in' will have 3 extra elements, interpreted + as dropout masks/scales for i_t, f_t and o_t. @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -2940,7 +2941,11 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m const Real i_scale = (have_dropout_mask ? input[i * input_stride + cell_dim * 5] : 1), f_scale = (have_dropout_mask ? 
- input[i * input_stride + cell_dim * 5 + 1] :1); + input[i * input_stride + cell_dim * 5 + 1] :1), + o_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5 + 2] :1); + + const Real i_t = Real(1) / (1 + exp(-i_part - w_ic * c_prev)); const Real f_t = Real(1) / (1 + exp(-f_part - w_fc * c_prev)); const Real tanh_c_part = tanh(c_part); @@ -2971,8 +2976,8 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m const Real dc_t_out = output_deriv[i * output_deriv_stride + j]; const Real dm_t = output_deriv[i * output_deriv_stride + j + cell_dim]; - const Real dtanh_c_t = o_t * dm_t; - const Real do_t = tanh_c_t * dm_t; + const Real dtanh_c_t = o_t * o_scale * dm_t; + const Real do_t = o_scale * tanh_c_t * dm_t; const Real do_t_input = (o_t_deriv * do_t - (2 * o_t - 1) * o_t_self_repair); diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 9abb6c7e8d1..9854692f356 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -144,7 +144,7 @@ static void UnitTestCuMathComputeLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 100; int32 cell_dim = 1 + Rand() % 2000; - int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); Matrix Hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix Hparams(3, cell_dim); Matrix Houtput(num_rows, 2 * cell_dim); @@ -166,7 +166,7 @@ static void UnitTestCuMathComputeLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output(num_rows, 2 * cell_dim); @@ -193,7 +193,7 @@ void UnitTestLstmNonlinearity() { // problem dimensions. int32 num_rows = RandInt(5, 20), cell_dim = RandInt(2, 200), - dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -299,7 +299,7 @@ static void UnitTestBackpropLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 200; int32 cell_dim = 1 + Rand() % 2000, - dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // KALDI_LOG << num_rows << ", " << cell_dim; Matrix hinput(num_rows, 5 * cell_dim + dropout_dim); @@ -412,7 +412,7 @@ static void UnitTestBackpropLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index b76721fcce3..13b2f450bbb 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -320,7 +320,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, int32 num_rows = input_mat.NumRows(), input_cols = input_mat.NumCols(), cell_dim = input_cols / 5; - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(output->NumRows() == num_rows); KALDI_ASSERT(params_mat.NumRows() == 3); KALDI_ASSERT(params_mat.NumCols() == cell_dim); @@ -333,7 +333,8 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, const Real *input_row = input_mat.RowData(r); // i_scale and f_scale relate to dropout, they will normally be 1.0. Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]), - f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]); + f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]), + o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]); Real *output_row = output_mat.RowData(r); for (int32 c = 0; c < cell_dim; c++) { @@ -349,7 +350,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, Real f_t = ScalarSigmoid(f_part + w_fc * c_prev); Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part); Real o_t = ScalarSigmoid(o_part + w_oc * c_t); - Real m_t = o_t * ScalarTanh(c_t); + Real m_t = o_t * o_scale * ScalarTanh(c_t); output_row[c] = c_t; output_row[c + cell_dim] = m_t; } @@ -363,7 +364,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, int32 num_rows = input.NumRows(), input_cols = input.NumCols(), cell_dim = input_cols / 5; - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(output->NumRows() == num_rows); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); @@ -373,7 +374,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, if (CuDevice::Instantiate().Enabled()) { Timer tim; - int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); + int have_dropout_mask = (input_cols == (cell_dim * 5) + 3); // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK @@ -427,7 +428,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, .NumCols(), cell_dim = input.NumCols() / 5; // Check dimensions. - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -526,7 +527,9 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, Real i_scale = (input_cols == cell_dim * 5 ? 1.0 : input_mat(r, cell_dim * 5)), f_scale = (input_cols == cell_dim * 5 ? 1.0 : - input_mat(r, cell_dim * 5 + 1)); + input_mat(r, cell_dim * 5 + 1)), + o_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5 + 2)); // For greater clarity, we give some of the quantities in the // forward equations their own names. @@ -567,8 +570,8 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, // comes directly from the output of this function. 
Real dc_t_out = output_deriv_mat(r, c); Real dm_t = output_deriv_mat(r, c + cell_dim); - Real dtanh_c_t = o_t * dm_t; - Real do_t = tanh_c_t * dm_t; + Real dtanh_c_t = o_t * o_scale * dm_t; + Real do_t = o_scale * tanh_c_t * dm_t; Real do_t_input = (o_t * (1.0F - o_t) * do_t - (2.0F * o_t - 1.0F) * o_t_self_repair); Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out @@ -650,7 +653,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, cell_dim = input.NumCols() / 5, input_cols = input.NumCols(); // Check dimensions. - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -685,7 +688,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK - int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); + int have_dropout_mask = (input_cols == (cell_dim * 5) + 3); // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 3313baaa9d1..3cc61da1744 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -88,9 +88,9 @@ void Group2norm(const CuMatrixBase &src, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). - This function will also accept input of dimension N by 5C+2, - and the two final elements will be used as scaling factors - on i_t and f_t (useful as per-frame dropout masks). + This function will also accept input of dimension N by 5C + 3, + and the three final elements will be used as scaling factors + on i_t, f_t and o_t (useful as per-frame dropout masks). @param [in] params A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}. @@ -136,9 +136,9 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). - This function will also accept input of dimension N by 5C+2, - and the two final elements will be interpreted as scaling factors - on i_t and f_t (useful as per-frame dropout masks). + This function will also accept input of dimension N by 5C + 3, + and the three final elements will be interpreted as scaling factors + on i_t, f_t and o_t (useful as per-frame dropout masks). @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -173,7 +173,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, have the same dimension as 'input'. In addition to the regular backpropagated derivative, the output will include small values relating to 'self-repair'. If the input - is of column-dimension 5C + 2 (i.e. we are using dropout + is of column-dimension 5C + 3 (i.e. we are using dropout masks), the derivatives w.r.t. the dropout masks will not be set; they will retain their value prior to this function call. 
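For reference, here is a scalar sketch (illustrative names, self-repair terms omitted, not the Kaldi implementation) of how the three dropout scales enter the backward pass described above; as documented, no derivative is produced for the scale columns themselves.

#include <cmath>
#include <cstdio>

static double Sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// Backprop for one cell. Inputs: the five forward inputs, the diagonal
// weights, the three dropout scales, and the derivatives w.r.t. the two
// outputs (dc_t_out for c_t, dm_t for m_t). Output: derivatives w.r.t.
// (i_part, f_part, c_part, o_part, c_prev) in d[0..4].
void LstmCellBackprop(double i_part, double f_part, double c_part,
                      double o_part, double c_prev,
                      double w_ic, double w_fc, double w_oc,
                      double i_scale, double f_scale, double o_scale,
                      double dc_t_out, double dm_t, double d[5]) {
  // Recompute the forward quantities.
  double i_t = Sigmoid(i_part + w_ic * c_prev);
  double f_t = Sigmoid(f_part + w_fc * c_prev);
  double tanh_c_part = std::tanh(c_part);
  double c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part;
  double o_t = Sigmoid(o_part + w_oc * c_t);
  double tanh_c_t = std::tanh(c_t);

  // Backward pass; the scales appear exactly where they did in the forward.
  double dtanh_c_t = o_t * o_scale * dm_t;
  double do_t = o_scale * tanh_c_t * dm_t;
  double do_t_input = o_t * (1.0 - o_t) * do_t;
  double dc_t = (1.0 - tanh_c_t * tanh_c_t) * dtanh_c_t
      + dc_t_out + do_t_input * w_oc;
  double dtanh_c_part = i_t * i_scale * dc_t;
  double df_t_input = dc_t * f_scale * c_prev * f_t * (1.0 - f_t);
  double di_t_input = dc_t * i_scale * tanh_c_part * i_t * (1.0 - i_t);

  d[0] = di_t_input;                                        // d(i_part)
  d[1] = df_t_input;                                        // d(f_part)
  d[2] = (1.0 - tanh_c_part * tanh_c_part) * dtanh_c_part;  // d(c_part)
  d[3] = do_t_input;                                        // d(o_part)
  d[4] = w_ic * di_t_input + w_fc * df_t_input
      + f_t * f_scale * dc_t;                               // d(c_prev)
}

int main() {
  double d[5];
  LstmCellBackprop(0.1, 0.2, 0.3, 0.4, 0.5, 0.1, 0.1, 0.1,
                   1.0, 1.0, 0.0, 0.2, 1.0, d);
  std::printf("di=%g df=%g dc=%g do=%g dc_prev=%g\n",
              d[0], d[1], d[2], d[3], d[4]);
  return 0;
}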
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 1cde6b3b0fa..dbb3729ec0d 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -5004,7 +5004,7 @@ void CompositeComponent::SetComponent(int32 i, Component *component) { int32 LstmNonlinearityComponent::InputDim() const { int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5 + (use_dropout_ ? 2 : 0); + return cell_dim * 5 + (use_dropout_ ? 3 : 0); } int32 LstmNonlinearityComponent::OutputDim() const { @@ -5081,8 +5081,12 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { self_repair_prob.Scale(1.0 / (count_ * cell_dim)); self_repair_prob.Write(os, binary); } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, use_dropout_); + if (use_dropout_) { + // only write this if true; we have back-compat code in reading anyway. + // this makes the models without dropout easier to read with older code. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index ea5df928b37..60fd1634598 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1698,10 +1698,10 @@ class ConvolutionComponent: public UpdatableComponent { // m_t = o_t * Tanh(c_t) (5) // # note: the outputs are just c_t and m_t. // -// [Note regarding dropout: optionally the input-dimension may be 5C + 2 instead +// [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead // of 5C in this case, the last two input dimensions will be interpreted as -// per-frame dropout masks on i_t and f_t respectively, so that in (3), i_t is -// replaced by i_t * i_t_scale, and likewise for f_t. +// per-frame dropout masks on i_t, f_t and o_t respectively, so that in (3), i_t is +// replaced by i_t * i_t_scale, and likewise for f_t and o_t. // // // The backprop is as you would think, but for the "self-repair" we need to pass From 96d92d77a6ba8e21b2fc5e012f3bdd019ffbd1b0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 31 Jan 2017 14:40:11 -0500 Subject: [PATCH 04/21] Merge remote-tracking branch 'upstream/shortcut' into shortcut-dropout --- egs/wsj/s5/steps/shift_feats.sh | 5 ++ .../s5/utils/data/shift_and_combine_feats.sh | 55 ++++++++++++ egs/wsj/s5/utils/data/shift_feats.sh | 55 ++++++++++++ src/featbin/shift-feats.cc | 89 +++++++++++++------ 4 files changed, 176 insertions(+), 28 deletions(-) create mode 100755 egs/wsj/s5/utils/data/shift_and_combine_feats.sh create mode 100755 egs/wsj/s5/utils/data/shift_feats.sh diff --git a/egs/wsj/s5/steps/shift_feats.sh b/egs/wsj/s5/steps/shift_feats.sh index 22b17f2cb09..ada5716f187 100755 --- a/egs/wsj/s5/steps/shift_feats.sh +++ b/egs/wsj/s5/steps/shift_feats.sh @@ -3,6 +3,9 @@ # Copyright 2016 Vimal Manohar # Apache 2.0 +# This script is deprecated. The newer script utils/data/shift_feats.sh +# should be used instead. + # This script shifts the feats in the input data directory and creates a # new directory _fs with shifted feats. # If the shift is negative, the initial frames get truncated and the @@ -25,6 +28,8 @@ if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -ne 4 ]; then + echo "This script is deprecated. The newer script utils/data/shift_feats.sh" + echo "should be used instead." 
echo "usage: $0 [options] "; echo "e.g.: $0 -1 data/train exp/shift-1_train mfcc" echo "options: " diff --git a/egs/wsj/s5/utils/data/shift_and_combine_feats.sh b/egs/wsj/s5/utils/data/shift_and_combine_feats.sh new file mode 100755 index 00000000000..1a15b324ee8 --- /dev/null +++ b/egs/wsj/s5/utils/data/shift_and_combine_feats.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 3 data/train data/train_fs3" + echo "For use in perturbing data for discriminative training and alignment of" + echo "frame-subsampled systems, this script uses utils/data/shift_feats.sh" + echo "and utils/data/combine_data.sh to shift the features" + echo " different ways and combine them." + echo "E.g. if is 3, this script will combine" + echo "the data frame-shifted by -1, 0 and 1 (c.f. shift-feats)." + exit 1 +fi + +frame_subsampling_factor=$1 +srcdir=$2 +destdir=$3 + +if [ ! -f $srcdir/feats.scp ]; then + echo "$0: expected $srcdir/feats.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +tmp_shift_destdirs=() +for frame_shift in `seq $[-(frame_subsampling_factor/2)] $[-(frame_subsampling_factor/2) + frame_subsampling_factor - 1]`; do + if [ "$frame_shift" == 0 ]; then continue; fi + utils/data/shift_feats.sh $frame_shift $srcdir ${destdir}_fs$frame_shift || exit 1 + tmp_shift_destdirs+=("${destdir}_fs$frame_shift") +done +utils/data/combine_data.sh $destdir $srcdir ${tmp_shift_destdirs[@]} || exit 1 +rm -r ${tmp_shift_destdirs[@]} + +utils/validate_data_dir.sh $destdir + +src_nf=`cat $srcdir/feats.scp | wc -l` +dest_nf=`cat $destdir/feats.scp | wc -l` +if [ $[src_nf*frame_subsampling_factor] -ne $dest_nf ]; then + echo "There was a problem. Expected number of feature lines in destination dir to be $[src_nf*frame_subsampling_factor];" + exit 1; +fi + +echo "$0: Successfully generated $frame_subsampling_factor-way shifted version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/data/shift_feats.sh b/egs/wsj/s5/utils/data/shift_feats.sh new file mode 100755 index 00000000000..2ae7b2435d3 --- /dev/null +++ b/egs/wsj/s5/utils/data/shift_feats.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2017 Hossein Hadian +# Apache 2.0 + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo " Usage: $0 " + echo "e.g.: $0 -1 data/train data/train_fs-1" + echo "The script creates a new data directory with the features modified" + echo "using the program shift-feats with the specified frame-shift." + echo "This program automatically adds the prefix 'fs-' to the" + echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh" + exit 1 +fi + +frame_shift=$1 +srcdir=$2 +destdir=$3 + + +if [ "$destdir" == "$srcdir" ]; then + echo "$0: this script requires and to be different." + exit 1 +fi + +if [ ! 
-f $srcdir/feats.scp ]; then + echo "$0: no such file $srcdir/feats.scp" + exit 1; +fi + +utt_prefix="fs$frame_shift-" +spk_prefix="fs$frame_shift-" + +mkdir -p $destdir +utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ + $srcdir $destdir + +if grep --quiet "'" $srcdir/feats.scp; then + echo "$0: the input features already use single quotes. Can't proceed." + exit 1; +fi + +awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \ +NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \ +NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \ + $destdir/feats.scp >$destdir/feats_shifted.scp +mv -f $destdir/feats_shifted.scp $destdir/feats.scp + +echo "$0: Done" + diff --git a/src/featbin/shift-feats.cc b/src/featbin/shift-feats.cc index 7b970e92248..5d392c9d15a 100644 --- a/src/featbin/shift-feats.cc +++ b/src/featbin/shift-feats.cc @@ -22,20 +22,41 @@ #include "util/common-utils.h" #include "matrix/kaldi-matrix.h" +namespace kaldi { + void ShiftFeatureMatrix(const Matrix &src, int32 shift, + Matrix* rearranged) { + for (int32 r = 0; r < src.NumRows(); r++) { + int32 src_r = r - shift; + if (src_r < 0) src_r = 0; + if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; + rearranged->Row(r).CopyFromVec(src.Row(src_r)); + } + } +} int main(int argc, char *argv[]) { try { using namespace kaldi; const char *usage = - "Copy features and possibly shift them in time while maintaining the length, e.g.\n" - "shift-feats --shift=1 will shift all frames to the\n" - "right by one (the first frame would be duplicated).\n" - "See also: copy-feats, copy-matrix\n"; + "Copy features, and possibly shift them while maintaining the " + "num-frames.\n" + "Usage: shift-feats [options] " + "\n" + "or: shift-feats [options] \n" + "e.g.: shift-feats --shift=-1 foo.scp bar.ark\n" + "or: shift-feats --shift=1 foo.mat bar.mat\n" + "See also: copy-feats, copy-matrix, select-feats, extract-rows,\n" + "subset-feats, subsample-feats, splice-feats, paste-feats, " + "concat-feats\n"; ParseOptions po(usage); + bool binary = true; int32 shift = 0; - po.Register("shift", &shift, "Number of frames by which to shift the features."); + po.Register("shift", &shift, "Number of frames by which to shift the " + "features."); + po.Register("binary", &binary, "Binary-mode output (not relevant if " + "writing to archive)"); po.Read(argc, argv); @@ -46,32 +67,40 @@ int main(int argc, char *argv[]) { int32 num_done = 0, num_err = 0; - SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); - BaseFloatMatrixWriter feat_writer(po.GetArg(2)); - - - for (; !feat_reader.Done(); feat_reader.Next()) { - const std::string &key = feat_reader.Key(); - const Matrix &src = feat_reader.Value(); - if (src.NumRows() == 0) { - KALDI_WARN << "Empty matrix for key " << key; - num_err++; - continue; + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); + BaseFloatMatrixWriter feat_writer(po.GetArg(2)); + + + for (; !feat_reader.Done(); feat_reader.Next()) { + const std::string &key = feat_reader.Key(); + const Matrix &src = feat_reader.Value(); + if (src.NumRows() == 0) { + KALDI_WARN << "Empty matrix for key " << key; + num_err++; + continue; + } + Matrix rearranged(src.NumRows(), src.NumCols()); + ShiftFeatureMatrix(src, shift, &rearranged); + feat_writer.Write(key, rearranged); + num_done++; } + + KALDI_LOG << "Shifted " << num_done << " features by " + << shift << " frames; " << num_err 
<< " with errors."; + return (num_done > 0 ? 0 : 1); + } else { + std::string feat_rxfilename = po.GetArg(1), + feat_wxfilename = po.GetArg(2); + Matrix src; + ReadKaldiObject(feat_rxfilename, &src); + if (src.NumRows() == 0) + KALDI_ERR << "Empty input matrix"; Matrix rearranged(src.NumRows(), src.NumCols()); - for (int32 r = 0; r < src.NumRows(); r++) { - int32 src_r = r - shift; - if (src_r < 0) src_r = 0; - if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; - rearranged.Row(r).CopyFromVec(src.Row(src_r)); - } - feat_writer.Write(key, rearranged); - num_done++; + ShiftFeatureMatrix(src, shift, &rearranged); + WriteKaldiObject(rearranged, feat_wxfilename, binary); + // we do not print any log messages here } - - KALDI_LOG << "Shifted " << num_done << " features by " - << shift << " frames; " << num_err << " with errors."; - return (num_done > 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; @@ -87,4 +116,8 @@ int main(int argc, char *argv[]) { 1 1 1 1 2 2 ] + + + echo "[ 1 1; 2 2; 3 3 ]" | ./shift-feats --print-args=false --binary=false \ + --shift=1 - - */ From 6582acf773debc16dba2a67cebf3f6ddaae74100 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 2 Feb 2017 20:32:29 -0500 Subject: [PATCH 05/21] [scripts] Update example scripts for dropout on Tedlium s5_r2 --- .../local/chain/tuning/run_tdnn_lstm_1p.sh | 4 + .../local/chain/tuning/run_tdnn_lstm_1s.sh | 383 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1t.sh | 382 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1u.sh | 385 ++++++++++++++++++ 4 files changed, 1154 insertions(+) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh index f06f4a7f6ec..eecc6bc2544 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh @@ -1,5 +1,9 @@ #!/bin/bash +# [note: this was later run as 1p2, with code and script changes that +# meant it was using dropout on 3 gates, as Gaofeng was really doing, +# not 2 as I thought he was doing.] + # 1p is as 1k, but [via script changes] doing the dropout as Gaofeng # did it in the non-fast LSTMs, with separate per-frame masks on # the i and f component. Using dropout schedule that maxes out at diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh new file mode 100755 index 00000000000..a9fa14ae132 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -0,0 +1,383 @@ +#!/bin/bash + +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. [note: mistakenly, this was run +# with not-per-frame dropout]. +# Results are not that encouraging. It's just slightly better than 1t. 
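+
+# [For reference -- this is just an illustration, not the code path that
+# steps/nnet3/chain/train.py actually uses: a dropout schedule string like
+# '0,0@0.20,0.3@0.5,0@0.75,0' is a list of proportion[@data-fraction] points,
+# with the first and last points implicitly at data-fractions 0.0 and 1.0 and
+# linear interpolation in between.  Under that reading, the proportion in
+# effect at data-fraction 0.35 would be:
+#   echo "0@0 0@0.20 0.3@0.5 0@0.75 0@1" | awk -v x=0.35 '{
+#     for (n = 1; n < NF; n++) {
+#       split($n, a, "@"); split($(n+1), b, "@");
+#       if (x >= a[2] && x <= b[2]) {
+#         print a[1] + (b[1] - a[1]) * (x - a[2]) / (b[2] - a[2]); exit; } } }'
+#   # prints 0.15, i.e. half-way up the ramp from 0.0 at 20% of the data to
+#   # the maximum of 0.3 at 50%.]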
+ +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi +# WER on dev(orig) 9.0 8.7 9.1 9.2 +# [looped:] 9.0 8.7 9.1 9.2 +# WER on dev(rescored) 8.4 8.2 8.3 8.6 +# [looped:] 8.4 8.2 8.3 8.6 +# WER on test(orig) 8.8 8.8 9.0 9.1 +# [looped:] 8.8 8.8 9.0 9.0 +# WER on test(rescored) 8.4 8.3 8.4 8.6 +# [looped:] 8.3 8.3 8.4 8.7 +# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 +# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 +# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 +# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
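+  # (Once both this stage and the regular decoding in stage 20 have run, the
+  # two sets of results can be compared with the same tool used for the
+  # comparisons in the comments at the top of this file, e.g.:
+  #   local/chain/compare_wer_general.sh --looped \
+  #     exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi
+  # which prints the [looped:] rows alongside the regular WERs.)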
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh new file mode 100755 index 00000000000..724081a4c61 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# 1t is as 1s but without dropout; it could be compared to 1e. +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. Seems a bit worse than +# the fast-LSTM code. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1t_sp_bi +# WER on dev(orig) 9.0 9.2 +# [looped:] 9.0 9.2 +# WER on dev(rescored) 8.4 8.6 +# [looped:] 8.4 8.6 +# WER on test(orig) 8.8 9.1 +# [looped:] 8.8 9.0 +# WER on test(rescored) 8.4 8.6 +# [looped:] 8.3 8.7 +# Final train prob -0.0648 -0.0618 +# Final valid prob -0.0827 -0.0794 +# Final train prob (xent) -0.8372 -0.8120 +# Final valid prob (xent) -0.9497 -0.9396 + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] 
+# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
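+  # (Since dropout-per-frame is not set here, the dropout mask is presumably
+  # sampled independently per element rather than once per frame -- which is
+  # what the "[note: mistakenly, this was run with not-per-frame dropout]"
+  # comment at the top of this file refers to; 1u sets dropout-per-frame=True.)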
+ lstmp_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh new file mode 100755 index 00000000000..eda096b487b --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -0,0 +1,385 @@ +#!/bin/bash + +# 1u is as 1s, but adding dropout-per-frame=true. +# Slightly better than 1s, but the improvement versus the baseline 1t is +# rather disappointing (only about 0.4 at most). + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 +# [looped:] 9.0 8.7 9.1 9.2 8.9 +# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 +# [looped:] 8.4 8.2 8.3 8.6 8.1 +# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 +# [looped:] 8.8 8.8 9.0 9.0 8.7 +# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 +# [looped:] 8.3 8.3 8.4 8.7 8.3 +# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 +# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 +# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 +# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 + +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] 
+# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
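+  # (dropout-per-frame=True below asks for one dropout decision per frame,
+  # shared across the whole vector, rather than an independent mask per
+  # element.  A quick sanity check that the option reached the generated
+  # configs is something like the following -- path shown for the default
+  # affix, adjust if you change --tdnn-lstm-affix:
+  #   grep -i dropout exp/chain_cleaned/tdnn_lstm1u_sp_bi/configs/*.config )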
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 From eb94ffde0f2c8eb132c264bfcde4f50122e7de0c Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 21 Mar 2017 12:32:40 +0800 Subject: [PATCH 06/21] for ref --- .vscode/settings.json | 3 + .../chain/tuning/run_tdnn_lstm_1u_1024.sh | 387 ++++++++++++++++++ .../tdnn_lstm_1u_newschedule_5epoch_1024.sh | 1 + 3 files changed, 391 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh create mode 100644 egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000000..fe7159848bd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.pylintEnabled": false +} \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh new file mode 100644 index 00000000000..e6a44bd0bc8 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh @@ -0,0 +1,387 @@ +#!/bin/bash + +# 1u is as 1s, but adding dropout-per-frame=true. +# Slightly better than 1s, but the improvement versus the baseline 1t is +# rather disappointing (only about 0.4 at most). + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 +# [looped:] 9.0 8.7 9.1 9.2 8.9 +# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 +# [looped:] 8.4 8.2 8.3 8.6 8.1 +# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 +# [looped:] 8.8 8.8 9.0 9.0 8.7 +# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 +# [looped:] 8.3 8.3 8.4 8.7 8.3 +# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 +# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 +# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 +# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 + +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. 
+# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. 
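+
+# Note that dropout_schedule and num_epoch default to empty below and are
+# expected to be supplied on the command line; e.g. the accompanying wrapper
+# script egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh (added in
+# the same commit) runs this script as:
+#   local/chain/tuning/run_tdnn_lstm_1u_1024.sh --train-stage 68 \
+#     --dropout-schedule "0,0@0.20,0.3@0.5,0" --num-epoch 5 \
+#     --tdnn-lstm-affix 1u_newschedule_5epoch_1024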
+ + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +dropout_schedule= +num_epoch= +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epoch \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+  rm $dir/.error 2>/dev/null || true
+  for dset in dev test; do
+      (
+      steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context-initial $extra_left_context_initial \
+          --frames-per-chunk 30 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \
+          --scoring-opts "--min-lmwt 5 " \
+         $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1;
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+        data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1
+      ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+
+
+exit 0
diff --git a/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh b/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh
new file mode 100644
index 00000000000..d41fb4f82c2
--- /dev/null
+++ b/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh
@@ -0,0 +1 @@
+local/chain/tuning/run_tdnn_lstm_1u_1024.sh --train-stage 68 --dropout-schedule "0,0@0.20,0.3@0.5,0" --num-epoch 5 --tdnn-lstm-affix 1u_newschedule_5epoch_1024

From 9afaf399ce353540839f9b7ebbde172eb8c29367 Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Sun, 9 Apr 2017 10:36:28 +0800
Subject: [PATCH 07/21] delete temporary tuning scripts in tedlium

---
 .../local/chain/tuning/run_tdnn_lstm_1p.sh    | 367 -----------------
 .../local/chain/tuning/run_tdnn_lstm_1q.sh    | 348 ----------------
 .../local/chain/tuning/run_tdnn_lstm_1s.sh    | 383 -----------------
 .../local/chain/tuning/run_tdnn_lstm_1t.sh    | 382 -----------------
 .../local/chain/tuning/run_tdnn_lstm_1u.sh    | 385 -----------------
 .../chain/tuning/run_tdnn_lstm_1u_1024.sh     | 387 ------------------
 .../tdnn_lstm_1u_newschedule_5epoch_1024.sh   |   1 -
 7 files changed, 2253 deletions(-)
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh
 delete mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh
 delete mode 100644 egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh

diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh
deleted file mode 100755
index eecc6bc2544..00000000000
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh
+++ /dev/null
@@ -1,367 +0,0 @@
-#!/bin/bash
-
-# [note: this was later run as 1p2, with code and script changes that
-# meant it was using dropout on 3 gates, as Gaofeng was really doing,
-# not 2 as I thought he was doing.]
-
-# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng
-# did it in the non-fast LSTMs, with separate per-frame masks on
-# the i and f component. Using dropout schedule that maxes out at
-# 0.3, which he found worked best for that type of dropout.
-
-# [See about 20 lines below for the original comparison with the baseline,
-# done when "p" was dropping out 2 gates [the i and f gates].
-# The comparison directly below is between the version that dropped out
-# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent
-# difference there.]
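For reference, the per-frame gate dropout described in the comments above works as follows: at every frame the i gate gets its own 0/1 dropout mask and the f gate gets a second, independently sampled one (the "p2" variant extends this to the o gate as well), with a single mask value per frame shared across the gate's dimensions. The NumPy sketch below only illustrates that sampling under assumed shapes and names; the real masks are produced inside the nnet3/cudamatrix code this patch series touches, not in these scripts.

  import numpy as np

  def per_frame_gate_masks(num_frames, dropout_proportion, num_gates=2, seed=0):
      # One Bernoulli draw per frame and per gate: the whole gate activation
      # at frame t is either kept or zeroed, and the i and f gates use
      # independently sampled masks (num_gates=3 would cover the o gate too).
      # Any rescaling of the kept values is left to the real component; this
      # only illustrates the independent per-frame sampling.
      rng = np.random.default_rng(seed)
      keep_prob = 1.0 - dropout_proportion
      return rng.binomial(1, keep_prob, size=(num_gates, num_frames)).astype(np.float32)

  # e.g. masks for a 20-frame chunk at dropout proportion 0.3:
  i_mask, f_mask = per_frame_gate_masks(20, 0.3)
  # the LSTM recursion would then use i_t * i_mask[t] and f_t * f_mask[t].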
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1p2 #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
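To make the schedule mentioned in the comment above concrete: a string such as '0,0@0.20,0.3@0.5,0@0.75,0' (the value passed to --trainer.dropout-schedule further down) is, roughly, read as (proportion, fraction-of-training) points and interpolated linearly between them, so the proportion stays at 0 for the first 20% of training, ramps up to 0.3 at the halfway point, and falls back to 0 by 75%. The sketch below is a simplified re-implementation for illustration only, not the parser the training scripts actually use.

  def dropout_at(progress, schedule="0,0@0.20,0.3@0.5,0@0.75,0"):
      # Each comma-separated field is "proportion@data_fraction"; a bare
      # proportion stands for fraction 0.0 if it is the first field and 1.0
      # if it is the last.  Values in between are linearly interpolated.
      points = []
      fields = schedule.split(",")
      for i, field in enumerate(fields):
          if "@" in field:
              value, frac = field.split("@")
          else:
              value, frac = field, "0.0" if i == 0 else "1.0"
          points.append((float(frac), float(value)))
      points.sort()
      for (x0, y0), (x1, y1) in zip(points, points[1:]):
          if x0 <= progress <= x1:
              return y0 if x1 == x0 else y0 + (y1 - y0) * (progress - x0) / (x1 - x0)
      return points[-1][1]

  # dropout_at(0.35) -> 0.15, dropout_at(0.5) -> 0.3, dropout_at(0.9) -> 0.0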
- lstmp_opts="decay-time=20 dropout-proportion=0.0" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh deleted file mode 100755 index f6a640fe17f..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh +++ /dev/null @@ -1,348 +0,0 @@ -#!/bin/bash - -# 1q is as 1p, but add the "dropout-exclusive" option which means that -# never drops out *both* the i and f gates. -# not helpful. see run_tdnn_lstm_1p.sh for results. - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. -# -# 1k is as 1e, but introducing a dropout schedule. - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.0 -# [looped:] 9.0 8.6 8.9 8.9 -# WER on dev(rescored) 8.4 7.9 8.2 8.2 -# [looped:] 8.4 7.8 8.2 8.3 -# WER on test(orig) 8.8 8.8 8.9 8.9 -# [looped:] 8.8 8.7 8.8 8.8 -# WER on test(rescored) 8.4 8.3 8.2 8.5 -# [looped:] 8.3 8.3 8.3 8.4 -# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 -# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 -# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 -# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 - - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). 
-# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1q #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. - lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-exclusive=true" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
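Returning to the dropout-exclusive option that distinguishes this 1q script: per frame, the i gate may be dropped or the f gate may be dropped, but never both at once. The sketch below samples such a pair of masks from a single uniform draw; it is only one way to realize the constraint, not the component's actual sampling code, and it assumes dropout_proportion <= 0.5 so the two drop regions fit.

  import numpy as np

  def exclusive_if_masks(num_frames, dropout_proportion, seed=0):
      # Drop i when u falls in [0, p) and drop f when u falls in [p, 2p);
      # the two regions are disjoint, so the gates are never dropped
      # together, yet each is still dropped with marginal probability p.
      rng = np.random.default_rng(seed)
      u = rng.uniform(size=num_frames)
      p = dropout_proportion
      i_mask = (u >= p).astype(np.float32)
      f_mask = ((u < p) | (u >= 2 * p)).astype(np.float32)
      return i_mask, f_mask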
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh deleted file mode 100755 index a9fa14ae132..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. [note: mistakenly, this was run -# with not-per-frame dropout]. -# Results are not that encouraging. It's just slightly better than 1t. - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi -# WER on dev(orig) 9.0 8.7 9.1 9.2 -# [looped:] 9.0 8.7 9.1 9.2 -# WER on dev(rescored) 8.4 8.2 8.3 8.6 -# [looped:] 8.4 8.2 8.3 8.6 -# WER on test(orig) 8.8 8.8 9.0 9.1 -# [looped:] 8.8 8.8 9.0 9.0 -# WER on test(rescored) 8.4 8.3 8.4 8.6 -# [looped:] 8.3 8.3 8.4 8.7 -# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
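In the same config-generation block, the learning_rate_factor computed a few lines above works out as follows (note the one-liner assumes a python2-style print statement): with xent_regularize=0.1 the factor is 0.5 / 0.1 = 5.0. A trivial worked version of that arithmetic, for illustration only:

  xent_regularize = 0.1
  learning_rate_factor = 0.5 / xent_regularize   # = 5.0
  # The xent objective is weighted by xent_regularize, so scaling the xent
  # output layer's learning rate by roughly 1/xent_regularize (the extra 0.5
  # was tuned) keeps its effective learning speed independent of that constant.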
- lstmp_opts="decay-time=20 dropout-proportion=0.0" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh deleted file mode 100755 index 724081a4c61..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh +++ /dev/null @@ -1,382 +0,0 @@ -#!/bin/bash - -# 1t is as 1s but without dropout; it could be compared to 1e. -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. Seems a bit worse than -# the fast-LSTM code. - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1t_sp_bi -# WER on dev(orig) 9.0 9.2 -# [looped:] 9.0 9.2 -# WER on dev(rescored) 8.4 8.6 -# [looped:] 8.4 8.6 -# WER on test(orig) 8.8 9.1 -# [looped:] 8.8 9.0 -# WER on test(rescored) 8.4 8.6 -# [looped:] 8.3 8.7 -# Final train prob -0.0648 -0.0618 -# Final valid prob -0.0827 -0.0794 -# Final train prob (xent) -0.8372 -0.8120 -# Final valid prob (xent) -0.9497 -0.9396 - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
- lstmp_opts="decay-time=20" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
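A side note on the --acwt 1.0 --post-decode-acwt 10.0 pair used in both decoding stages of these scripts: chain models are trained with an acoustic scale of 1.0, and multiplying the acoustic scores by 10 when the lattices are written lets the usual integer LM-weight sweep in the scoring scripts (--min-lmwt 5 and up) cover sensible relative scales. A rough illustration of the resulting effective scale, under the assumption that scoring weights the LM by lmwt:

  def effective_acoustic_scale(lmwt, post_decode_acwt=10.0):
      # Weighting the LM by lmwt is equivalent to scaling the acoustics by
      # 1/lmwt, so with acoustics pre-multiplied by post_decode_acwt the net
      # acoustic scale is post_decode_acwt / lmwt; lmwt=10 recovers the
      # scale of 1.0 the chain model was trained with.
      return post_decode_acwt / lmwt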
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh deleted file mode 100755 index eda096b487b..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh +++ /dev/null @@ -1,385 +0,0 @@ -#!/bin/bash - -# 1u is as 1s, but adding dropout-per-frame=true. -# Slightly better than 1s, but the improvement versus the baseline 1t is -# rather disappointing (only about 0.4 at most). - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi -# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 -# [looped:] 9.0 8.7 9.1 9.2 8.9 -# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 -# [looped:] 8.4 8.2 8.3 8.6 8.1 -# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 -# [looped:] 8.8 8.8 9.0 9.0 8.7 -# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 -# [looped:] 8.3 8.3 8.4 8.7 8.3 -# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 -# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 -# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 -# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 - -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
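The comment above notes that the dropout-proportion written into the config only needs to be present, because the --trainer.dropout-schedule option (set to '0,0@0.20,0.3@0.5,0@0.75,0' in stage 18 below) controls the actual proportion during training. The sketch below shows one way to read such a schedule string, assuming piecewise-linear interpolation between (training-fraction, proportion) points with bare first and last entries pinned to fractions 0.0 and 1.0; this is an assumed reading for illustration, not the actual parsing code invoked by train.py.

def dropout_at(schedule, progress):
    # 'p0,p1@f1,...,pN' -> dropout proportion at a given fraction of training done.
    points = []
    for k, piece in enumerate(schedule.split(',')):
        if '@' in piece:
            p, f = piece.split('@')
            points.append((float(f), float(p)))
        else:
            # entries without '@' are assumed to sit at the start/end of training
            points.append((0.0 if k == 0 else 1.0, float(piece)))
    points.sort()
    for (f0, p0), (f1, p1) in zip(points, points[1:]):
        if f0 <= progress <= f1:
            return p0 if f1 == f0 else p0 + (p1 - p0) * (progress - f0) / (f1 - f0)
    return points[-1][1]

# The schedule used here ramps from 0 up to 0.3 between 20% and 50% of
# training, then back down to 0 by 75%:
for prog in (0.0, 0.2, 0.35, 0.5, 0.6, 0.75, 1.0):
    print(prog, dropout_at('0,0@0.20,0.3@0.5,0@0.75,0', prog))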
- lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh deleted file mode 100644 index e6a44bd0bc8..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh +++ /dev/null @@ -1,387 +0,0 @@ -#!/bin/bash - -# 1u is as 1s, but adding dropout-per-frame=true. -# Slightly better than 1s, but the improvement versus the baseline 1t is -# rather disappointing (only about 0.4 at most). - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi -# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 -# [looped:] 9.0 8.7 9.1 9.2 8.9 -# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 -# [looped:] 8.4 8.2 8.3 8.6 8.1 -# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 -# [looped:] 8.8 8.8 9.0 9.0 8.7 -# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 -# [looped:] 8.3 8.3 8.4 8.7 8.3 -# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 -# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 -# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 -# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 - -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=17 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -dropout_schedule= -num_epoch= -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
- lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule="$dropout_schedule" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs $num_epoch \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh b/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh deleted file mode 100644 index d41fb4f82c2..00000000000 --- a/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh +++ /dev/null @@ -1 +0,0 @@ -local/chain/tuning/run_tdnn_lstm_1u_1024.sh --train-stage 68 --dropout-schedule "0,0@0.20,0.3@0.5,0" --num-epoch 5 --tdnn-lstm-affix 1u_newschedule_5epoch_1024 From e9ac4e2343805f4f38e824f0dfd65cd9cca7dc1b Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 10:39:25 +0800 Subject: [PATCH 08/21] delete irrelevant file --- .vscode/settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index fe7159848bd..00000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.linting.pylintEnabled": false -} \ No newline at end of file From 638f0834c85efaf476f14d91f33d97412a039fea Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 11:13:04 +0800 Subject: [PATCH 09/21] delete exclusive option in fast lstm code --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 2 -- src/nnet3/nnet-general-component.cc | 22 +++------------------ src/nnet3/nnet-general-component.h | 9 +-------- 3 files changed, 4 insertions(+), 29 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 55ac4704c0a..9d95e41ab12 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -720,7 +720,6 @@ def set_default_configs(self): # be used (note: this is # per-frame dropout on the # output of the i_t and f_t gates) - 'dropout-exclusive' : False # option affecting dropout masks. } def set_derived_configs(self): @@ -820,7 +819,6 @@ def generate_lstm_config(self): lstm_str = self.config['lstm-nonlinearity-options'] dropout_proportion = self.config['dropout-proportion'] - dropout_exclusive = 'true' if self.config['dropout-exclusive'] else 'false' configs = [] diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 3f47e1e01d2..761ffbd6815 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1380,20 +1380,17 @@ std::string DropoutMaskComponent::Info() const { std::ostringstream stream; stream << Type() << ", output-dim=" << output_dim_ - << ", dropout-proportion=" << dropout_proportion_ - << ", exclusive=" << (exclusive_ ? 
"true" : "false"); + << ", dropout-proportion=" << dropout_proportion_; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5), - exclusive_(false) { } + output_dim_(-1), dropout_proportion_(0.5) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_), - exclusive_(other.exclusive_) { } + dropout_proportion_(other.dropout_proportion_) { } void DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1407,16 +1404,9 @@ void DropoutMaskComponent::Propagate( out->Set(1.0); return; } - if (!exclusive_) { const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - } else { - if (!(output_dim_ == 2 && dropout_proportion <= 0.5)) { - KALDI_ERR << "If exclusive=true is set, output-dim must equal 2 (got: " - << output_dim_ << " and dropout-proportion must <= 0.5 (got: " - << dropout_proportion; - } // To generate data where it's never the case that both of the dimensions // for a row are zero, we generate uniformly distributed data (call this u_i), // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) @@ -1442,8 +1432,6 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &exclusive_); ExpectToken(is, binary, ""); } @@ -1454,8 +1442,6 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, output_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_proportion_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, exclusive_); WriteToken(os, binary, ""); } @@ -1469,8 +1455,6 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); - exclusive_ = false; - cfl->GetValue("exclusive", &exclusive_); } diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index d3de9f40548..d5d7a140177 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -689,8 +689,7 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 exclusive=false - // [for the meaning of 'exclusive', see its declaration]. + // dropout-proportion=0.5 output-dim=-1 virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -745,12 +744,6 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; - // If true, and only in the special case where output_dim_ == 2, this - // component will make sure that it's never the case that both columns of a - // row of the output are zero. Note: if this is true, you cannot set - // dropout_proportion_ > 0.5. - bool exclusive_; - const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. 
}; From 49c4558c27b10a89c0337e4d9d7779dca1424070 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 13:10:12 +0800 Subject: [PATCH 10/21] solve some cuda-kernel line mismatch problem --- src/cudamatrix/cu-kernels-ansi.h | 130 ++++++++++--------------------- src/cudamatrix/cu-kernels.h | 107 ++++++------------------- 2 files changed, 63 insertions(+), 174 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index cbdecfb7386..5b72a62e716 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -330,6 +330,7 @@ void cudaF_diff_log_softmax(dim3 Gr, dim3 Bl, const MatrixDim in_deriv_dim, const float* out_deriv, const int out_deriv_stride, float* in_deriv); void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int in_stride, const double* params, const int params_stride, @@ -349,6 +350,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, double* self_repair_sum_out, const int self_repair_sum_out_stride); void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int in_stride, const float* params, const int params_stride, @@ -455,12 +457,14 @@ void cudaF_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out); void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out); void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, @@ -636,93 +640,41 @@ void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, double* trace_vec_out); - -void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, - double alpha, MatrixElement* x, - int num_elements); -void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, - double alpha, const Int32Pair* indices, - const double* x, int s, double* data); -void cudaD_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, - const double* z, MatrixDim d, double* z2, - MatrixDim d2, double* t); - -void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, - const double* T, MatrixDim tdim, double *S, - MatrixDim sdim); -void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, - const double *src_data, MatrixDim src_dim, - const Int32Pair *indices); -void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, - const double *src_data, MatrixDim src_dim, - const Int32Pair *indexes); -void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim, - const Int32Pair *indices, int indices_size, - double *output); - -void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, - const double *mat2, double *mask, - MatrixDim mat1_dim, int mat2_stride, - int mask_stride); - -void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const 
double* in, - const int in_stride, const double* params, - const int params_stride, const int out_stride, - const int cell_dim, const int have_dropout_mask, - const int num_rows, - double* out); -void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, - const int in_stride, const float* params, - const int params_stride, const int out_stride, - const int cell_dim, const int have_dropout_mask, - const int num_rows, - float* out); -void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const double* input, - const int in_stride, const double* params, - const int params_stride, - const double* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const double* self_repair_config, - double count, double* input_deriv, - const int input_deriv_stride, - double* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - double* self_repair_sum_out, - const int self_repair_sum_out_stride); -void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const float* input, - const int in_stride, const float* params, - const int params_stride, - const float* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const float* self_repair_config, double count, - float* input_deriv, - const int input_deriv_stride, - float* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - float* self_repair_sum_out, - const int self_repair_sum_out_stride); - - -void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out, - MatrixDim d_out, const double *v_in); -void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, - const float *v_in); +void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const float* mat_in, + const MatrixElement* smat_in, + MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, + float* trace_vec_out); +void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val, + float* num, int dim); +void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val, + float* num, int dim); +void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim); +void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim); +void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, + float* num, int dim); +void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, + float* num, int dim); +void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim); +void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim); +void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, + const double *src, int dim); +void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, + const float *src, int dim); +void cudaD_vec_max(int Gr, int Bl, const double* v, double* value, int dim, + int inc); +void cudaF_vec_max(int Gr, int Bl, const float* v, float* value, int dim, + int inc); +void cudaD_vec_min(int Gr, int Bl, const double* v, double* value, int dim, + int inc); +void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim, + int inc); +void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, + int dim); +void cudaF_vec_mul_elements(int 
Gr, int Bl, float* v, const float* a, int dim); +void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim); +void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); +void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); +void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); } // extern "C" diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a8d305c5bf4..d2a79f471c8 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -626,6 +626,7 @@ inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl, out_deriv, out_deriv_stride, in_deriv); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, @@ -645,7 +646,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, + input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -656,6 +658,7 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out_stride); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, @@ -675,7 +678,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, + num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -849,17 +853,21 @@ inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, double* out) { cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, float* out) { cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, @@ -1300,90 +1308,19 @@ inline void cuda_vec_min(int Gr, int Bl, const float* v, float* value, int dim, int inc) { cudaF_vec_min(Gr, Bl, v, value, dim, inc); } - -inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, - const int in_stride, const double* params, - const int params_stride, - const int out_stride, const int cell_dim, - const int have_dropout_mask, - const int num_rows, double* out) { - 
cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, have_dropout_mask, - num_rows, out); +inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, + int dim) { + cudaD_vec_mul_elements(Gr, Bl, v, a, dim); } -inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, - const int in_stride, const float* params, - const int params_stride, - const int out_stride, const int cell_dim, - const int have_dropout_mask, - const int num_rows, float* out) { - cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, have_dropout_mask, - num_rows, out); +inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, + int dim) { + cudaF_vec_mul_elements(Gr, Bl, v, a, dim); } -inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const double* input, - const int input_stride, - const double* params, - const int params_stride, - const double* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const double* self_repair_config, - double count, double* input_deriv, - const int input_deriv_stride, - double* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - double* self_repair_sum_out, - const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, - input, input_stride, - params, params_stride, output_deriv, - output_deriv_stride, deriv_sum_in, - deriv_sum_in_stride, self_repair_config, count, - input_deriv, input_deriv_stride, params_deriv, - params_deriv_stride, value_sum_out, - value_sum_out_stride, deriv_sum_out, - deriv_sum_out_stride, self_repair_sum_out, - self_repair_sum_out_stride); +inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { + cudaD_vec_soft_max(Gr, Bl, v, dim); } -inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const float* input, - const int input_stride, - const float* params, - const int params_stride, - const float* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const float* self_repair_config, - double count, float* input_deriv, - const int input_deriv_stride, - float* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - float* self_repair_sum_out, - const int self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, - num_rows, input, input_stride, - params, params_stride, output_deriv, - output_deriv_stride, deriv_sum_in, - deriv_sum_in_stride, self_repair_config, count, - input_deriv, input_deriv_stride, params_deriv, - params_deriv_stride, value_sum_out, - value_sum_out_stride, deriv_sum_out, - deriv_sum_out_stride, self_repair_sum_out, - self_repair_sum_out_stride); +inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { + cudaF_vec_soft_max(Gr, Bl, v, dim); } inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { From 05fc6d250408048e7b6ecf63ed033c9e9b1de9a1 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 14:44:53 +0800 Subject: [PATCH 11/21] small bug fix --- 
src/nnet3/nnet-general-component.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 761ffbd6815..85743490518 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1424,7 +1424,6 @@ void DropoutMaskComponent::Propagate( out->CopyColFromVec(temp, 1); out->ApplyHeaviside(); } -} void DropoutMaskComponent::Read(std::istream &is, bool binary) { From 90df5d7c81bb3af14edbadce1e5a7920f992681b Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 15:02:45 +0800 Subject: [PATCH 12/21] small fix --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9d95e41ab12..c92afb1c2dc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -717,9 +717,7 @@ def set_default_configs(self): 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, 'dropout-proportion' : -1.0, # If -1.0, no dropout will - # be used (note: this is - # per-frame dropout on the - # output of the i_t and f_t gates) + # be used) } def set_derived_configs(self): From 1a5823672105c9c2790ea39fb9b50c84e3b61ed9 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 11 Apr 2017 09:28:15 +0800 Subject: [PATCH 13/21] update scripts for tdnn-(fast)lstm of AMI-IHM --- egs/ami/s5b/RESULTS_ihm | 14 + .../local/chain/tuning/run_tdnn_lstm_1l.sh | 293 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 299 ++++++++++++++++++ 3 files changed, 606 insertions(+) create mode 100644 egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh create mode 100644 egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 25a60d24cfb..660fac9c200 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,6 +84,20 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model + dropout +%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1j.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+fast-LSTM model +%WER 20.8 | 13098 94485 | 82.1 10.3 7.6 3.0 20.8 53.0 | -0.140 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys +%WER 20.3 | 12643 89982 | 82.3 11.4 6.3 2.6 20.3 51.1 | -0.035 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+fast-LSTM model + dropout +%WER 19.9 | 13098 94476 | 83.0 9.7 7.3 2.9 19.9 51.7 | -0.059 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.3 | 
12643 89969 | 83.1 10.8 6.1 2.4 19.3 49.9 | 0.045 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100644 index 00000000000..02680b92f30 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1i but with per-frame dropout on LSTM layer +#IHM +#System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 +#WER on dev 20.6 19.8 +#WER on eval 20.1 19.2 +#Final train prob -0.045 -0.067 +#Final valid prob -0.098 -0.098 +#Final train prob (xent) -0.723 -0.916 +#Final valid prob (xent) -1.04 -1.10 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +dropout_schedule='0,0@0.20,0.3@0.50,0' + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1l #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh new file mode 100644 index 00000000000..395a6dff483 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -0,0 +1,299 @@ +#!/bin/bash + +# 1m is same as 1j but with the by-frame dropout fast-lstmp + +#IHM +#System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 +#WER on dev 20.8 19.9 +#WER on eval 20.3 19.3 +#Final train prob -0.044 -0.065 +#Final valid prob -0.107 -0.100 +#Final train prob (xent) -0.684 -0.885 +#Final valid prob (xent) -1.05 -1.09 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +dropout_schedule='0,0@0.20,0.3@0.50,0' + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1m #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + 
output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
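+  # (As far as I understand, mkgraph.sh takes the topology from the model in
+  # $dir, i.e. $dir/final.mdl, which is why the lang directory used here does
+  # not need to be the 'chain' one.)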
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 69a36e46f9b2312f46a6e1cb7cb998e2fceca5a4 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 11 Apr 2017 09:53:56 +0800 Subject: [PATCH 14/21] change scripts comment style and RESULTS --- egs/ami/s5b/RESULTS_ihm | 10 ---------- egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh | 13 ++++++++----- egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh | 13 +++++++++---- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 660fac9c200..6438f64a6c9 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,16 +84,6 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys -# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned -# cleanup + chain TDNN+LSTM model + dropout -%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys - -# local/chain/tuning/run_tdnn_lstm_1j.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned -# cleanup + chain TDNN+fast-LSTM model -%WER 20.8 | 13098 94485 | 82.1 10.3 7.6 3.0 20.8 53.0 | -0.140 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys -%WER 20.3 | 12643 89982 | 82.3 11.4 6.3 2.6 20.3 51.1 | -0.035 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys - # local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+fast-LSTM model + dropout %WER 19.9 | 13098 94476 | 83.0 9.7 7.3 2.9 19.9 51.7 | -0.059 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 02680b92f30..50d8d5ad0b9 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -5,11 +5,14 @@ #System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 #WER on dev 20.6 19.8 #WER on eval 20.1 19.2 -#Final train prob -0.045 -0.067 -#Final valid prob -0.098 -0.098 -#Final train prob (xent) -0.723 -0.916 -#Final valid prob 
(xent) -1.04 -1.10 - +#Final train prob -0.044763 -0.0666221 +#Final valid prob -0.0981107 -0.097616 +#Final train prob (xent) -0.722765 -0.915559 +#Final valid prob (xent) -1.03985 -1.09907 + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.064->-0.059 xent:train/valid[58,88,final]=(-0.940,-0.739,-0.723/-1.14,-1.04,-1.04) logprob:train/valid[58,88,final]=(-0.067,-0.046,-0.045/-0.103,-0.099,-0.098) +# exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.094->-0.082 xent:train/valid[58,88,final]=(-3.10,-1.11,-0.916/-3.17,-1.29,-1.10) logprob:train/valid[58,88,final]=(-0.164,-0.073,-0.067/-0.182,-0.104,-0.098) set -e -o pipefail diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index 395a6dff483..f2244fdc1c8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -6,10 +6,15 @@ #System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 #WER on dev 20.8 19.9 #WER on eval 20.3 19.3 -#Final train prob -0.044 -0.065 -#Final valid prob -0.107 -0.100 -#Final train prob (xent) -0.684 -0.885 -#Final valid prob (xent) -1.05 -1.09 +#Final train prob -0.0439145 -0.0653269 +#Final valid prob -0.10673 -0.0998743 +#Final train prob (xent) -0.683776 -0.884698 +#Final valid prob (xent) -1.05254 -1.09002 + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.063->-0.058 xent:train/valid[58,88,final]=(-0.888,-0.695,-0.684/-1.12,-1.06,-1.05) logprob:train/valid[58,88,final]=(-0.065,-0.045,-0.044/-0.105,-0.107,-0.107) +# exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.092->-0.080 xent:train/valid[58,88,final]=(-3.12,-1.09,-0.885/-3.20,-1.27,-1.09) logprob:train/valid[58,88,final]=(-0.164,-0.072,-0.065/-0.181,-0.103,-0.100) + set -e -o pipefail From d03be0ff2091f6d614d0cf2c5c08130f52c9de8a Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Wed, 12 Apr 2017 09:46:18 +0800 Subject: [PATCH 15/21] adding SDM results --- egs/ami/s5b/RESULTS_ihm | 8 +-- egs/ami/s5b/RESULTS_sdm | 4 ++ .../local/chain/tuning/run_tdnn_lstm_1i.sh | 3 +- .../local/chain/tuning/run_tdnn_lstm_1j.sh | 3 +- .../local/chain/tuning/run_tdnn_lstm_1l.sh | 54 +++++++++++++++++- .../local/chain/tuning/run_tdnn_lstm_1m.sh | 56 +++++++++++++++++-- 6 files changed, 115 insertions(+), 13 deletions(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 6438f64a6c9..bdd5a18b235 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,10 +84,10 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys -# local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned -# cleanup + chain TDNN+fast-LSTM model + dropout -%WER 19.9 | 13098 94476 | 83.0 9.7 7.3 2.9 19.9 51.7 | -0.059 | 
exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 19.3 | 12643 89969 | 83.1 10.8 6.1 2.4 19.3 49.9 | 0.045 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model + per-frame dropout +%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 05b68e5e780..9ed296f51b1 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -91,6 +91,10 @@ %WER 37.6 | 15122 94495 | 66.1 18.7 15.1 3.7 37.6 63.2 | 0.646 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys %WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model, SDM audio + alignments from ihm data + per-frame dropout. +%WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+LSTM model, SDM original + IHM reverberated data, alignments from ihm data. diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 3e3976ac7a8..92636b4c17e 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -26,6 +26,7 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -242,7 +243,7 @@ if [ $stage -le 16 ]; then --egs.chunk-right-context $chunk_right_context \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh index 008060df070..a96230075b6 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -34,6 +34,7 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -254,7 +255,7 @@ if [ $stage -le 16 ]; then --egs.chunk-right-context-final 0 \ --trainer.num-chunk-per-minibatch 64,32 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 50d8d5ad0b9..eac59626a0f 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -1,7 +1,12 @@ #!/bin/bash -# same as 1i but with per-frame dropout on LSTM layer -#IHM +# This (1l.sh) is the same as 1j but with per-frame dropout on LSTM layer +# It is a regular (non-fast) LSTM with per-frame dropout on [i, f, o] gates of the LSTM, +# the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. +# We have tried both 4-epoch and 5-epoch training. + +### IHM +# Results with flags : --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ #System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 #WER on dev 20.6 19.8 #WER on eval 20.1 19.2 @@ -10,10 +15,53 @@ #Final train prob (xent) -0.722765 -0.915559 #Final valid prob (xent) -1.03985 -1.09907 -# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ # exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.064->-0.059 xent:train/valid[58,88,final]=(-0.940,-0.739,-0.723/-1.14,-1.04,-1.04) logprob:train/valid[58,88,final]=(-0.067,-0.046,-0.045/-0.103,-0.099,-0.098) # exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.094->-0.082 xent:train/valid[58,88,final]=(-3.10,-1.11,-0.916/-3.17,-1.29,-1.10) logprob:train/valid[58,88,final]=(-0.164,-0.073,-0.067/-0.182,-0.104,-0.098) +# Results with flags for (1l.sh) : --num-epochs 5 --tlstm-affix 1i_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1i.sh) : --num-epochs 5 --tlstm-affix 1l_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_5epoch_sp_bi_ld5 tdnn_lstm1l_5epoch_sp_bi_ld5 +#WER on dev 20.8 19.7 +#WER on eval 20.6 19.3 +#Final train prob -0.0347795-0.0600903 +#Final valid prob -0.102486-0.0964607 +#Final train prob (xent) -0.621007 -0.84667 +#Final valid prob (xent) -1.02634 -1.04725 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.053->-0.049 xent:train/valid[73,110,final]=(-0.832,-0.631,-0.621/-1.09,-1.03,-1.03) logprob:train/valid[73,110,final]=(-0.057,-0.037,-0.035/-0.102,-0.103,-0.102) +# exp/ihm/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.085->-0.074 xent:train/valid[73,110,final]=(-3.14,-1.02,-0.847/-3.20,-1.21,-1.05) logprob:train/valid[73,110,final]=(-0.162,-0.065,-0.060/-0.177,-0.101,-0.096) + +### SDM +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm 
tri3_cleaned \ +#System tdnn_lstm1i_sp_bi_ihmali_ld5 tdnn_lstm1l_sp_bi_ihmali_ld5 +#WER on dev 37.0 35.9 +#WER on eval 40.0 39.4 +#Final train prob -0.106971 -0.15439 +#Final valid prob -0.252201 -0.244499 +#Final train prob (xent) -1.41142 -1.73795 +#Final valid prob (xent) -2.13741 -2.14519 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.138->-0.128 xent:train/valid[57,86,final]=(-1.78,-1.42,-1.41/-2.23,-2.14,-2.14) logprob:train/valid[57,86,final]=(-0.155,-0.108,-0.107/-0.251,-0.254,-0.252) +# exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.192->-0.174 xent:train/valid[57,86,final]=(-3.74,-1.95,-1.74/-3.86,-2.31,-2.15) logprob:train/valid[57,86,final]=(-0.287,-0.165,-0.154/-0.335,-0.250,-0.244) + +# Results with flags for (1i.sh) : --num-epochs 5 --tlstm-affix 1i_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1l.sh) : --num-epochs 5 --tlstm-affix 1l_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5 +#WER on dev 36.9 35.8 +#WER on eval 40.2 39.5 +#Final train prob -0.0854552 -0.134189 +#Final valid prob -0.262789 -0.244183 +#inal train prob (xent) -1.2195 -1.58789 +#Final valid prob (xent) -2.13389 -2.08964 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5 exp/sdm1/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.111->-0.104 xent:train/valid[71,108,final]=(-1.61,-1.25,-1.22/-2.16,-2.15,-2.13) logprob:train/valid[71,108,final]=(-0.133,-0.089,-0.085/-0.246,-0.264,-0.263) +# exp/sdm1/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.170->-0.153 xent:train/valid[71,108,final]=(-3.67,-1.76,-1.59/-3.81,-2.22,-2.09) logprob:train/valid[71,108,final]=(-0.274,-0.144,-0.134/-0.327,-0.248,-0.244) + + set -e -o pipefail # First the options that are passed through to run_ivector_common.sh diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index f2244fdc1c8..b0e7af0618d 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -1,8 +1,12 @@ #!/bin/bash -# 1m is same as 1j but with the by-frame dropout fast-lstmp +# This (1m.sh) is the same as 1j but with per-frame dropout on LSTM layer +# It is a fast LSTM with per-frame dropout on [i, f, o] gates of the LSTM, +# the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. +# We have tried both 4-epoch and 5-epoch training. 
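+# (Here "per-frame" dropout means that, roughly speaking, a single dropout-mask
+# value is drawn per frame and per gate and shared across the whole cell
+# dimension, rather than a separate mask per dimension; the proportion follows
+# $dropout_schedule, e.g. '0,0@0.20,0.3@0.50,0' keeps it at 0 for the first 20%
+# of training, ramps it linearly up to 0.3 at the half-way point, and brings it
+# back to 0 by the end of training.)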
-#IHM +### IHM +# Results with flags : --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ #System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 #WER on dev 20.8 19.9 #WER on eval 20.3 19.3 @@ -15,6 +19,48 @@ # exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.063->-0.058 xent:train/valid[58,88,final]=(-0.888,-0.695,-0.684/-1.12,-1.06,-1.05) logprob:train/valid[58,88,final]=(-0.065,-0.045,-0.044/-0.105,-0.107,-0.107) # exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.092->-0.080 xent:train/valid[58,88,final]=(-3.12,-1.09,-0.885/-3.20,-1.27,-1.09) logprob:train/valid[58,88,final]=(-0.164,-0.072,-0.065/-0.181,-0.103,-0.100) +# Results with flags for (1m.sh) : --num-epochs 5 --tlstm-affix 1m_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +# Results with flags for (1j.sh) : --num-epochs 5 --tlstm-affix 1j_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_5epoch_sp_bi_ld5 tdnn_lstm1m_5epoch_sp_bi_ld5 +#WER on dev 21.1 19.9 +#WER on eval 20.9 19.8 +#Final train prob -0.0365079 -0.057024 +#Final valid prob -0.112709-0.0992725 +#inal train prob (xent) -0.601602 -0.800653 +#Final valid prob (xent) -1.03241 -1.04748 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.053->-0.049 xent:train/valid[73,110,final]=(-0.813,-0.615,-0.602/-1.08,-1.04,-1.03) logprob:train/valid[73,110,final]=(-0.057,-0.038,-0.037/-0.106,-0.113,-0.113) +# exp/ihm/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.080->-0.072 xent:train/valid[73,110,final]=(-3.15,-0.985,-0.801/-3.26,-1.21,-1.05) logprob:train/valid[73,110,final]=(-0.161,-0.062,-0.057/-0.183,-0.102,-0.099) + +#### SDM +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_sp_bi_ihmali_ld5 tdnn_lstm1m_sp_bi_ihmali_ld5 +#WER on dev 36.9 36.4 +#WER on eval 40.5 39.9 +#Final train prob -0.108141 -0.148861 +#Final valid prob -0.257468 -0.240962 +#Final train prob (xent) -1.38179 -1.70258 +#Final valid prob (xent) -2.13095 -2.12803 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1m_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.138->-0.128 xent:train/valid[57,86,final]=(-1.71,-1.39,-1.38/-2.18,-2.14,-2.13) logprob:train/valid[57,86,final]=(-0.150,-0.110,-0.108/-0.251,-0.260,-0.257) +# exp/sdm1/chain_cleaned/tdnn_lstm1m_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.187->-0.170 xent:train/valid[57,86,final]=(-3.74,-1.90,-1.70/-3.88,-2.28,-2.13) logprob:train/valid[57,86,final]=(-0.286,-0.158,-0.149/-0.336,-0.245,-0.241) + +# Results with flags for (1m.sh) : --num-epochs 5 --tlstm-affix 1m_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1j.sh) : --num-epochs 5 --tlstm-affix 1j_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5 +#WER on dev 37.4 36.0 +#WER on eval 40.7 39.6 +#Final 
train prob -0.0879063 -0.133092 +#Final valid prob -0.270953 -0.243246 +#Final train prob (xent) -1.20822 -1.56293 +#Final valid prob (xent) -2.1425 -2.07265 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.115->-0.107 xent:train/valid[71,108,final]=(-1.56,-1.22,-1.21/-2.16,-2.16,-2.14) logprob:train/valid[71,108,final]=(-0.131,-0.090,-0.088/-0.256,-0.273,-0.271) +# exp/sdm1/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.167->-0.153 xent:train/valid[71,108,final]=(-3.69,-1.71,-1.56/-3.84,-2.20,-2.07) logprob:train/valid[71,108,final]=(-0.279,-0.140,-0.133/-0.329,-0.247,-0.243) + set -e -o pipefail @@ -30,7 +76,9 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -dropout_schedule='0,0@0.20,0.3@0.50,0' +dropout_schedule='0,0@0.20,0.3@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -252,7 +300,7 @@ if [ $stage -le 16 ]; then --trainer.dropout-schedule $dropout_schedule \ --trainer.num-chunk-per-minibatch 64,32 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ From 936863ee456364bc90e5cd904a3a31adbc83cd56 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Mon, 17 Apr 2017 09:31:41 +0800 Subject: [PATCH 16/21] adding SWBD (parts of all) scripts with dropout --- .../s5c/local/chain/tuning/run_blstm_6l.sh | 247 ++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1k.sh | 321 ++++++++++++++++++ 2 files changed, 568 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh new file mode 100644 index 00000000000..e577f96a58f --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# 6l is same as 6k, but with the per-frame dropout +# location4 as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# local/chain/compare_wer_general.sh blstm_6k_sp blstm_6l_sp +# result (14.5 vs 14.1), this may due to noise + +# System blstm_6k_sp blstm_6l_sp +# WER on train_dev(tg) 13.30 13.06 +# WER on train_dev(fg) 12.34 12.16 +# WER on eval2000(tg) 15.5 15.2 +# WER on eval2000(fg) 14.1 13.8 +# Final train prob -0.052 -0.065 +# Final valid prob -0.090 -0.093 +# Final train prob (xent) -0.743 -0.831 +# Final valid prob (xent) -0.9579 -0.9821 + +# exp/chain/blstm_6k_sp/: num-iters=327 nj=3..16 num-params=41.2M dim=40+100->6074 combine=-0.069->-0.069 xent:train/valid[217,326,final]=(-0.849,-0.748,-0.743/-1.04,-0.959,-0.958) logprob:train/valid[217,326,final]=(-0.065,-0.053,-0.052/-0.096,-0.090,-0.090) +# exp/chain/blstm_6l_sp/: num-iters=327 nj=3..16 num-params=41.2M dim=40+100->6074 combine=-0.084->-0.082 xent:train/valid[217,326,final]=(-1.45,-0.840,-0.831/-1.58,-0.994,-0.982) 
logprob:train/valid[217,326,final]=(-0.110,-0.066,-0.065/-0.132,-0.094,-0.093) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6l # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 +dropout_schedule='0,0@0.20,0.1@0.50,0' + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh new file mode 100644 index 00000000000..21cb4fa9373 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# run_tdnn_lstm_1k.sh is like run_tdnn_lstm_1e.sh but +# added the per-frame dropout location 4 as paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1k_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1k_sp +# WER on train_dev(tg) 13.18 12.60 +# [looped:] 13.10 12.56 +# WER on train_dev(fg) 12.21 11.58 +# [looped:] 12.28 11.62 +# WER on eval2000(tg) 15.8 15.2 +# [looped:] 15.8 15.2 +# WER on eval2000(fg) 14.5 13.7 +# [looped:] 14.5 13.8 +# Final train prob -0.060 -0.076 +# Final valid prob -0.101 -0.106 +# Final train prob (xent) -0.868 -0.989 +# Final valid prob (xent) -1.0740 -1.1341 + +# exp/chain/tdnn_lstm_1e_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.072->-0.071 xent:train/valid[173,261,final]=(-1.01,-0.876,-0.868/-1.16,-1.08,-1.07) logprob:train/valid[173,261,final]=(-0.075,-0.061,-0.060/-0.106,-0.101,-0.101) +# exp/chain/tdnn_lstm_1k_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.093->-0.089 xent:train/valid[173,261,final]=(-2.87,-1.07,-0.989/-2.90,-1.20,-1.13) logprob:train/valid[173,261,final]=(-0.153,-0.079,-0.076/-0.179,-0.107,-0.106) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1e # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
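+  # (The '7000' below is the target number of tree leaves, and
+  # --frame-subsampling-factor 3 matches the reduced output frame rate
+  # used by 'chain' models.)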
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
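+  # (Looped decoding carries the recurrent state over from chunk to chunk
+  # instead of re-initializing it from extra left-context, which is why it
+  # only makes a difference for forward-recurrent models like the LSTMs here.)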
+ rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; From f51fb75dd76dca6e1ca6cdbf0a18c78967fcdc64 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Mon, 17 Apr 2017 09:31:56 +0800 Subject: [PATCH 17/21] small fix --- egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh index e577f96a58f..68daf81ab01 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -4,6 +4,7 @@ # location4 as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf # local/chain/compare_wer_general.sh blstm_6k_sp blstm_6l_sp +# attention: the blatm_6k_sp result here is far better than the updated # result (14.5 vs 14.1), this may due to noise # System blstm_6k_sp blstm_6l_sp From 139f412fcb7b0df71065458e22a16558a0184529 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 18 Apr 2017 11:19:39 +0800 Subject: [PATCH 18/21] update tdnn-blstm with dropout in SWBD --- .../local/chain/tuning/run_tdnn_blstm_1b.sh | 248 ++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh new file mode 100644 index 00000000000..3929cdc432e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh @@ -0,0 +1,248 @@ +#!/bin/bash + +# tdnn_blstm_1b is same as tdnn_blstm_1a, but with the per-frame dropout +# added with location 4, see paper: +# 
http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh tdnn_blstm_1a_sp tdnn_blstm_1b_sp +# System tdnn_blstm_1a_sp tdnn_blstm_1b_sp +# WER on train_dev(tg) 12.86 12.60 +# WER on train_dev(fg) 11.86 11.80 +# WER on eval2000(tg) 15.3 14.9 +# WER on eval2000(fg) 14.0 13.5 +# Final train prob -0.042 -0.054 +# Final valid prob -0.099 -0.091 +# Final train prob (xent) -0.637 -0.719 +# Final valid prob (xent) -0.9418 -0.9190 + +# exp/chain/tdnn_blstm_1a_sp/: num-iters=327 nj=3..16 num-params=53.7M dim=40+100->6074 combine=-0.058->-0.057 xent:train/valid[217,326,final]=(-0.753,-0.631,-0.637/-0.974,-0.941,-0.942) logprob:train/valid[217,326,final]=(-0.055,-0.041,-0.042/-0.094,-0.099,-0.099) +# exp/chain/tdnn_blstm_1b_sp/: num-iters=327 nj=3..16 num-params=53.7M dim=40+100->6074 combine=-0.070->-0.068 xent:train/valid[217,326,final]=(-1.27,-0.732,-0.719/-1.42,-0.931,-0.919) logprob:train/valid[217,326,final]=(-0.094,-0.055,-0.054/-0.117,-0.091,-0.091) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_blstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 +dropout_schedule='0,0@0.20,0.1@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
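+  # (--leftmost-questions-truncate -1, as set above, means the questions are
+  # not truncated.)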
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
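+  # (For example, with xent_regularize=0.025 as set above, this factor is
+  # 0.5/0.025 = 20.)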
+ output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From 9a8b81cb839b2ac8f99fe4bd6fbc17a6e8bc1eff Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 18 Apr 2017 12:07:55 +0800 Subject: [PATCH 19/21] update tdnn+regular-LSTM(4epoch) in SWBD 5epoch is on the way --- .../local/chain/tuning/run_tdnn_lstm_1l.sh | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100644 index 00000000000..e88e199839c --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# tdnn_lstm_1l is same as tdnn_lstm_1b, but with the per-frame dropout +# added with location 4 in LSTM layer, see paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh tdnn_lstm_1b_ld5_sp tdnn_lstm_1l_ld5_sp +# System tdnn_lstm_1b_ld5_sp tdnn_lstm_1l_ld5_sp +# WER on train_dev(tg) 13.06 12.41 +# WER on train_dev(fg) 12.13 11.59 +# WER on eval2000(tg) 15.1 14.8 +# WER on eval2000(fg) 13.9 13.5 +# Final train prob -0.047 -0.069 +# Final valid prob -0.093 -0.095 +# Final train prob (xent) -0.735 -0.913 +# Final valid prob (xent) -1.0151 -1.0820 + +# exp/chain/tdnn_lstm_1b_ld5_sp: num-iters=327 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.062->-0.061 xent:train/valid[217,326,final]=(-0.877,-0.741,-0.735/-1.08,-1.02,-1.02) logprob:train/valid[217,326,final]=(-0.063,-0.048,-0.047/-0.095,-0.093,-0.093) +# exp/chain/tdnn_lstm_1l_ld5_sp: num-iters=327 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.088->-0.084 xent:train/valid[217,326,final]=(-3.32,-0.961,-0.913/-3.40,-1.13,-1.08) logprob:train/valid[217,326,final]=(-0.176,-0.072,-0.069/-0.198,-0.097,-0.095) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1l # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
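+  # (for reference: 7000 below is the target number of tree leaves, and
+  # --frame-subsampling-factor 3 matches the reduced output frame rate of
+  # 'chain' models, which emit one output per 3 input frames.)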
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From 48f41a7a399a16c7e778605f693ade507d7eae6c Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 20 Apr 2017 09:32:07 +0800 Subject: [PATCH 20/21] adding tedlium scripts also SWBD RESULTS updated --- egs/swbd/s5c/RESULTS | 6 + .../local/chain/tuning/run_tdnn_lstm_1s.sh | 333 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1t.sh | 333 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1u.sh | 327 +++++++++++++++++ 4 files changed, 999 insertions(+) create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index f103200f966..2cf34c600c1 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -203,6 +203,12 @@ exit 0 %WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# current best 'chain' models with TDNN + LSTM + dropout (see local/chain/run_tdnn_lstm_1l.sh) +%WER 13.5 | 4459 42989 | 88.2 8.0 3.8 1.7 13.5 48.2 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 8.8 | 1831 21395 | 92.3 5.2 2.5 1.1 8.8 41.9 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 18.1 | 2628 21594 | 84.0 10.8 5.2 2.2 18.1 52.6 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 11.59 [ 5615 / 48460, 708 ins, 1450 del, 3457 sub ] exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 + # these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh %WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 19.4 | 1831 21395 | 83.5 11.2 5.2 3.0 19.4 60.7 | exp/ctc/lstm_sp/decode_eval2000_sw1_tg_0.15/score_12_0.5/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh new file mode 100644 index 00000000000..dc0f59fb64a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# 1s is as 1e, but adding per-frame dropout to LSTM in location4 +# as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh 
--looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1s_sp_bi +# WER on dev(orig) 9.0 8.9 +# [looped:] 9.0 8.9 +# WER on dev(rescored) 8.4 8.1 +# [looped:] 8.4 8.1 +# WER on test(orig) 8.9 8.8 +# [looped:] 8.9 8.8 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.4 8.3 +# Final train prob -0.0712 -0.0914 +# Final valid prob -0.0892 -0.0977 +# Final train prob (xent) -0.8566 -0.9931 +# Final valid prob (xent) -0.9927 -1.0633 + +# exp/chain_cleaned/tdnn_lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.082->-0.081 xent:train/valid[167,252,final]=(-0.961,-0.859,-0.857/-1.06,-0.999,-0.993) logprob:train/valid[167,252,final]=(-0.086,-0.072,-0.071/-0.098,-0.091,-0.089) +# exp/chain_cleaned/tdnn_lstm1s_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.104->-0.101 xent:train/valid[167,252,final]=(-3.08,-1.07,-0.993/-3.13,-1.14,-1.06) logprob:train/valid[167,252,final]=(-0.181,-0.093,-0.091/-0.183,-0.100,-0.098) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +dropout_schedule="0,0@0.2,0.3@0.5,0" +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh new file mode 100644 index 00000000000..c286fcef353 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# 1t is as 1e, but increasing the TDNN dim and LSTM cell-dim into +# 1024, the recurrent and non-recurrent projection of the LSTM from +# 128 into 256. 
+ +# ./local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_again_sp_bi tdnn_lstm1t_again_sp_bi +# WER on dev(orig) 9.0 8.9 +# [looped:] 9.0 8.9 +# WER on dev(rescored) 8.4 8.2 +# [looped:] 8.4 8.3 +# WER on test(orig) 8.9 8.9 +# [looped:] 8.9 9.0 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.4 8.5 +# Final train prob -0.0712 -0.0459 +# Final valid prob -0.0892 -0.0867 +# Final train prob (xent) -0.8566 -0.6434 +# Final valid prob (xent) -0.9927 -0.8733 + +# exp/chain_cleaned/tdnn_lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.082->-0.081 xent:train/valid[167,252,final]=(-0.961,-0.859,-0.857/-1.06,-0.999,-0.993) logprob:train/valid[167,252,final]=(-0.086,-0.072,-0.071/-0.098,-0.091,-0.089) +# exp/chain_cleaned/tdnn_lstm1t_sp_bi: num-iters=253 nj=2..12 num-params=37.1M dim=40+100->3626 combine=-0.055->-0.055 xent:train/valid[167,252,final]=(-0.774,-0.655,-0.643/-0.928,-0.883,-0.873) logprob:train/valid[167,252,final]=(-0.063,-0.048,-0.046/-0.087,-0.089,-0.087) + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh new file mode 100644 index 00000000000..9e50060f5d6 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# 1u is the same as 1t but adding per-frame dropout to LSTM +# in location4, see paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1t_again_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 8.9 8.6 +# WER on dev(rescored) 8.2 8.0 +# WER on test(orig) 8.9 8.3 +# WER on test(rescored) 8.4 7.9 +# Final train prob -0.0459 -0.0709 +# Final valid prob -0.0867 -0.0902 +# Final train prob (xent) -0.6434 -0.8112 +# Final valid prob (xent) -0.8733 -0.9384 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +dropout_schedule="0,0@0.20,0.3@0.5,0" +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 
decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
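+  # (the topology it does use comes from $dir/final.mdl, which mkgraph.sh
+  # reads together with the tree in $dir.)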
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 From 62fee2b5e0a579f5afa8f786e16262993eb782e3 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 20 Apr 2017 09:56:14 +0800 Subject: [PATCH 21/21] small fix --- egs/ami/s5b/RESULTS_ihm | 1 + egs/ami/s5b/RESULTS_sdm | 1 + egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index bdd5a18b235..a2b5d0c3a5c 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -85,6 +85,7 @@ %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys # local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# same as local/chain/tuning/run_tdnn_lstm_1i.sh, except that dropout is adopted # cleanup + chain TDNN+LSTM model + per-frame dropout %WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 9ed296f51b1..bbe0ba3aa12 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -92,6 +92,7 @@ %WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys # local/chain/tuning/run_tdnn_lstm_1l.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# same as local/chain/tuning/run_tdnn_lstm_1i.sh, except that dropout is adopted # cleanup + chain TDNN+LSTM model, SDM audio + alignments from ihm data + per-frame dropout. %WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index eac59626a0f..74c0f5a6ead 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This (1l.sh) is the same as 1j but with per-frame dropout on LSTM layer +# This (1l.sh) is the same as 1i but with per-frame dropout on LSTM layer # It is a regular (non-fast) LSTM with per-frame dropout on [i, f, o] gates of the LSTM, # the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. # We have tried both 4-epoch and 5-epoch training.
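As a concrete illustration of two mechanisms the recipes above depend on, the --trainer.dropout-schedule string and per-frame dropout on the LSTM gates, here is a minimal numpy sketch. It is not code from these patches: the function names and shapes are invented for illustration, the schedule parsing assumes the usual proportion[@fraction-of-data] format with linear interpolation between the listed points, and the question of rescaling surviving values by 1/(1-p) is ignored here.

import numpy as np

def dropout_proportion(schedule, progress):
    """Return the dropout proportion at 'progress' (fraction of training
    completed, in [0, 1]) for a schedule string like '0,0@0.20,0.3@0.50,0'.
    Each entry is proportion[@fraction]; the first entry is taken to be at
    fraction 0.0 and the last at 1.0, with linear interpolation between."""
    entries = []
    parts = schedule.split(',')
    for i, part in enumerate(parts):
        if '@' in part:
            prop, frac = part.split('@')
        else:
            prop, frac = part, ('0.0' if i == 0 else '1.0')
        entries.append((float(frac), float(prop)))
    entries.sort()
    fracs = [f for f, _ in entries]
    props = [p for _, p in entries]
    return float(np.interp(progress, fracs, props))

def per_frame_gate_dropout(i_t, f_t, o_t, proportion, rng):
    """Apply per-frame dropout to the i, f and o gate activations of a single
    frame: one Bernoulli keep/drop draw per gate, broadcast across the whole
    cell dimension (this is how 'per-frame' is read here)."""
    keep_prob = 1.0 - proportion
    masks = rng.binomial(1, keep_prob, size=3)  # one scalar mask per gate
    return i_t * masks[0], f_t * masks[1], o_t * masks[2]

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    sched = '0,0@0.20,0.3@0.50,0'
    # The proportion stays at 0 until 20% of training, rises to 0.3 at 50%,
    # then falls back to 0 by the end; e.g. at 35% it is 0.15.
    for progress in (0.1, 0.35, 0.5, 0.75, 1.0):
        print(progress, dropout_proportion(sched, progress))
    cell_dim = 4
    i_t, f_t, o_t = (rng.rand(cell_dim) for _ in range(3))
    print(per_frame_gate_dropout(i_t, f_t, o_t,
                                 dropout_proportion(sched, 0.5), rng))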