diff --git a/egs/librispeech/s5/local/chain/run_chain_common.sh b/egs/librispeech/s5/local/chain/run_chain_common.sh
index 5598c8deb21..da37e148441 100755
--- a/egs/librispeech/s5/local/chain/run_chain_common.sh
+++ b/egs/librispeech/s5/local/chain/run_chain_common.sh
@@ -13,6 +13,8 @@ gmm_dir=
 ali_dir=
 lores_train_data_dir=
 
+num_leaves=6000
+
 # output directory names. They are also compulsory.
 lang=
 lat_dir=
@@ -74,7 +76,7 @@ if [ $stage -le 13 ]; then
   fi
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
       --context-opts "--context-width=2 --central-position=1" \
-      --cmd "$train_cmd" 6000 ${lores_train_data_dir} $lang $ali_dir $tree_dir
+      --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
 fi
 
 exit 0;
diff --git a/egs/librispeech/s5/local/chain/run_tdnn.sh b/egs/librispeech/s5/local/chain/run_tdnn.sh
index 61f8f499182..d48449e28bd 120000
--- a/egs/librispeech/s5/local/chain/run_tdnn.sh
+++ b/egs/librispeech/s5/local/chain/run_tdnn.sh
@@ -1 +1 @@
-tuning/run_tdnn_1b.sh
\ No newline at end of file
+tuning/run_tdnn_1c.sh
\ No newline at end of file
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
new file mode 100755
index 00000000000..ceadd890b5c
--- /dev/null
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
@@ -0,0 +1,265 @@
+#!/bin/bash
+set -e
+
+## Adapted from swbd for librispeech by David van Leeuwen
+
+# The swbd 7n recipe this is based on is a kind of factorized TDNN (TDNN-F),
+# with skip connections.
+
+# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_1c_sp
+# exp/chain_cleaned/tdnn_1c_sp: num-iters=1307 nj=3..16 num-params=20.1M dim=40+100->6024 combine=-0.051->-0.050 (over 23) xent:train/valid[869,1306,final]=(-0.808,-0.767,-0.771/-0.828,-0.780,-0.787) logprob:train/valid[869,1306,final]=(-0.051,-0.049,-0.047/-0.059,-0.056,-0.056)
+
+# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1b_sp exp/chain_cleaned/tdnn_1c_sp
+# System                      tdnn_1b_sp  tdnn_1c_sp
+# WER on dev(fglarge)               3.77        3.35
+# WER on dev(tglarge)               3.90        3.49
+# WER on dev(tgmed)                 4.89        4.30
+# WER on dev(tgsmall)               5.47        4.78
+# WER on dev_other(fglarge)        10.05        8.76
+# WER on dev_other(tglarge)        10.80        9.26
+# WER on dev_other(tgmed)          13.07       11.21
+# WER on dev_other(tgsmall)        14.46       12.47
+# WER on test(fglarge)              4.20        3.87
+# WER on test(tglarge)              4.28        4.08
+# WER on test(tgmed)                5.31        4.80
+# WER on test(tgsmall)              5.97        5.25
+# WER on test_other(fglarge)       10.44        8.95
+# WER on test_other(tglarge)       11.05        9.41
+# WER on test_other(tgmed)         13.36       11.52
+# WER on test_other(tgsmall)       14.90       12.66
+# Final train prob               -0.0670     -0.0475
+# Final valid prob               -0.0704     -0.0555
+# Final train prob (xent)        -1.0502     -0.7708
+# Final valid prob (xent)        -1.0441     -0.7874
+
+# configs for 'chain'
+stage=0
+decode_nj=50
+min_seg_len=1.55
+train_set=train_960_cleaned
+gmm=tri6b_cleaned
+nnet3_affix=_cleaned
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1c
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# TDNN options
+frames_per_eg=150,110,100
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+
+test_online_decoding=true  # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the
+# standard nnet3 setup; you can skip them by setting "--stage 11" if you
+# have already run them.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --min-seg-len $min_seg_len \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats
+dir=exp/chain${nnet3_affix}/tdnn_${affix}_sp
+train_data_dir=data/${train_set}_sp_hires_comb
+lores_train_data_dir=data/${train_set}_sp_comb
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp \
+    $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+# Please take this as a reference on how to specify all the options of
+# local/chain/run_chain_common.sh
+local/chain/run_chain_common.sh --stage $stage \
+                                --gmm-dir $gmm_dir \
+                                --ali-dir $ali_dir \
+                                --lores-train-data-dir ${lores_train_data_dir} \
+                                --lang $lang \
+                                --lat-dir $lat_dir \
+                                --tree-dir $tree_dir || exit 1;
+
+if [ $stage -le 14 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  opts="l2-regularize=0.002"
+  linear_opts="orthonormal-constraint=1.0"
+  output_opts="l2-regularize=0.0005 bottleneck-dim=256"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have an input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 $opts dim=1280
+  linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0)
+  relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280
+  linear-component name=tdnn3l dim=256 $linear_opts
+  relu-batchnorm-layer name=tdnn3 $opts dim=1280
+  linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0)
+  relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280
+  linear-component name=tdnn5l dim=256 $linear_opts
+  relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l)
+  linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0)
+  relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280
+  linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0)
+  relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280
+  linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0)
+  relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280
+  linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0)
+  relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280
+  linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0)
+  relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280
+  linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0)
+  relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280
+  linear-component name=prefinal-l dim=256 $linear_opts
+
+  relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 15 ]; then
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch 128 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs 6 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir || exit 1;
+fi
+
+graph_dir=$dir/graph_tgsmall
+if [ $stage -le 16 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  # remove <UNK> from the graph, and convert back to const-FST.
+  fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
+    fstconvert --fst_type=const > $graph_dir/temp.fst
+  mv $graph_dir/temp.fst $graph_dir/HCLG.fst
+fi
+
+iter_opts=
+if [ ! -z $decode_iter ]; then
+  iter_opts=" --iter $decode_iter "
+fi
+if [ $stage -le 17 ]; then
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in test_clean test_other dev_clean dev_other; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
+      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+          data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
+      steps/lmrescore_const_arpa.sh \
+          --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
+          data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1
+      steps/lmrescore_const_arpa.sh \
+          --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
+          data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+
+if $test_online_decoding && [ $stage -le 18 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+      --mfcc-config conf/mfcc_hires.conf \
+      $lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+  for data in test_clean test_other dev_clean dev_other; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it the graph from the non-online directory.
+      steps/online/nnet3/decode.sh --nj $nspk --cmd "$decode_cmd" \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          $graph_dir data/${data}_hires ${dir}_online/decode_${data}_tgsmall || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in online decoding"
+    exit 1
+  fi
+fi
+
+exit 0;
diff --git a/egs/wsj/s5/utils/parallel/queue.pl b/egs/wsj/s5/utils/parallel/queue.pl
--- a/egs/wsj/s5/utils/parallel/queue.pl
+++ b/egs/wsj/s5/utils/parallel/queue.pl
@@ ... @@
       if (rand() > 0.5) {
-        system("touch $qdir/.kick");
+        system("touch $qdir/.kick 2>/dev/null");
       } else {
         system("rm $qdir/.kick 2>/dev/null");
       }
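
Note on the run_chain_common.sh hunk (not part of the patch): it only lifts the previously hardcoded 6000 tree leaves into a --num-leaves option, with the default unchanged, so tuning scripts can override it. A minimal sketch of such a caller, reusing the variables of the tuning script above; the 7000 value is purely illustrative, not taken from this patch:

    # Build the tree with a non-default number of leaves (7000 is illustrative).
    local/chain/run_chain_common.sh --stage $stage \
                                    --gmm-dir $gmm_dir \
                                    --ali-dir $ali_dir \
                                    --lores-train-data-dir ${lores_train_data_dir} \
                                    --lang $lang \
                                    --lat-dir $lat_dir \
                                    --num-leaves 7000 \
                                    --tree-dir $tree_dir || exit 1;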
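
The run_tdnn.sh hunk rewrites a symlink (git file mode 120000), not a regular file; the one-line "diff" is the link target. Reproducing it by hand amounts to:

    cd egs/librispeech/s5/local/chain
    ln -sf tuning/run_tdnn_1c.sh run_tdnn.sh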
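
In stage 16, fstrmsymbols is given the Kaldi rxfilename "echo 3|", i.e. it reads the symbol list from the output of `echo 3`, so the output symbol with id 3 (<UNK> in data/lang_test_tgsmall/words.txt) is stripped from HCLG.fst. A sketch that looks the id up instead of hardcoding 3, assuming the usual one-"word id"-pair-per-line layout of words.txt:

    # Find the id of <UNK> in the decoding lang dir, then strip it from the graph.
    unk_id=$(awk '$1 == "<UNK>" {print $2}' data/lang_test_tgsmall/words.txt)
    fstrmsymbols --apply-to-output=true --remove-arcs=true "echo $unk_id|" \
      $graph_dir/HCLG.fst - | fstconvert --fst_type=const > $graph_dir/temp.fst
    mv $graph_dir/temp.fst $graph_dir/HCLG.fst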