diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh new file mode 100755 index 00000000000..ef4824bf7f2 --- /dev/null +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh @@ -0,0 +1,341 @@ +#!/bin/bash + + +# run_cnn_tdnn_1a10.sh is modified from run_tdnn_1b.sh but taking +# the xconfig from mini-librispeech's run_cnn_tdnn_1a54.sh; only +# reducing the bottleneck-dim from 96 to 64, which is the value +# the run_tdnn1b.sh script here has. +# Better! +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a10_sp +# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a10_sp +# %WER devtest 53.07 52.54 51.10 +# %WER test 59.25 53.70 52.07 +# %WER native 54.47 48.76 47.88 +# %WER nonnative 63.01 57.66 55.51 +# Final train prob -0.0253 -0.0547 -0.0502 +# Final valid prob -0.0687 -0.0694 -0.0661 +# Final train prob (xent) -0.7715 -0.9502 -0.8513 +# Final valid prob (xent) -1.0719 -1.0849 -0.9915 +# Num-params 6567648 3321312 3345088 + + + +# 1b is as 1a but a re-tuned model with quite a few changes, including moving to +# a resnet-style factored TDNN-F model. 
+# +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# %WER devtest 53.07 52.54 +# %WER test 59.25 53.70 +# %WER native 54.47 48.76 +# %WER nonnative 63.01 57.66 +# Final train prob -0.0253 -0.0547 +# Final valid prob -0.0687 -0.0694 +# Final train prob (xent) -0.7715 -0.9502 +# Final valid prob (xent) -1.0719 -1.0849 +# Num-params 6567648 3321312 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp +# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="native nonnative devtest test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a10 # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_leaves=3500 + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --cmd "$train_cmd" \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + $num_leaves \ + ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + cnn_opts="l2-regularize=0.03" + ivector_layer_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + 
--trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 \ + data/lang_test \ + $tree_dir \ + $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 6024 combine=-0.051->-0.050 (over 23) xent:train/valid[869,1306,final]=(-0.808,-0.767,-0.771/-0.828,-0.780,-0.787) logprob:train/valid[869,1306,final]=(-0.051,-0.049,-0.047/-0.059,-0.056,-0.056) + +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1b_sp exp/chain_cleaned/tdnn_1c_sp +# System tdnn_1b_sp tdnn_1c_sp +# WER on dev(fglarge) 3.77 3.35 +# WER on dev(tglarge) 3.90 3.49 +# WER on dev(tgmed) 4.89 4.30 +# WER on dev(tgsmall) 5.47 4.78 +# WER on dev_other(fglarge) 10.05 8.76 +# WER on dev_other(tglarge) 10.80 9.26 +# WER on dev_other(tgmed) 13.07 11.21 +# WER on dev_other(tgsmall) 14.46 12.47 +# WER on test(fglarge) 4.20 3.87 +# WER on test(tglarge) 4.28 4.08 +# WER on test(tgmed) 5.31 4.80 +# WER on test(tgsmall) 5.97 5.25 +# WER on test_other(fglarge) 10.44 8.95 +# WER on test_other(tglarge) 11.05 9.41 +# WER on test_other(tgmed) 13.36 11.52 +# WER on 
test_other(tgsmall) 14.90 12.66 +# Final train prob -0.0670 -0.0475 +# Final valid prob -0.0704 -0.0555 +# Final train prob (xent) -1.0502 -0.7708 +# Final valid prob (xent) -1.0441 -0.7874 + +# configs for 'chain' +stage=0 +decode_nj=50 +train_set=train_960_cleaned +gmm=tri6b_cleaned +nnet3_affix=_cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1d +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# TDNN options +frames_per_eg=150,110,100 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 
time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 2500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00015 \ + --trainer.optimization.final-effective-lrate 0.000015 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; + +fi + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + # remove from the graph, and convert back to const-FST. + fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ + fstconvert --fst_type=const > $graph_dir/temp.fst + mv $graph_dir/temp.fst $graph_dir/HCLG.fst +fi + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 18 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for data in test_clean test_other dev_clean dev_other; do + ( + nspk=$(wc -l 22. Better, on average. 
+# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a23_sp exp/chain/cnn_tdnn1a23b_sp exp/chain/cnn_tdnn1a24_sp exp/chain/cnn_tdnn1a24b_sp +# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a23_sp cnn_tdnn1a23b_sp cnn_tdnn1a24_sp cnn_tdnn1a24b_sp +#WER dev_clean_2 (tgsmall) 13.18 13.04 12.15 12.11 11.95 11.86 +# [online:] 13.03 12.97 12.18 12.07 11.99 11.96 +#WER dev_clean_2 (tglarge) 9.18 9.16 8.57 8.47 8.57 8.54 +# [online:] 9.29 9.24 8.64 8.50 8.63 8.57 +# Final train prob -0.0531 -0.0590 -0.0456 -0.0462 -0.0461 -0.0455 +# Final valid prob -0.0844 -0.0865 -0.0800 -0.0802 -0.0800 -0.0798 +# Final train prob (xent) -1.5244 -1.7771 -1.0691 -1.0683 -1.0776 -1.0781 +# Final valid prob (xent) -1.7447 -1.9611 -1.3190 -1.3108 -1.3131 -1.3190 +# Num-params 3512112 3512112 4474688 4474688 4474688 4474688 + +# 1a23 is as 1a14 but for the last cnn layer (cnn5), using twice the num-filters +# plus subsampling on the output. +# A bit better, on average! +# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a14_sp exp/chain/cnn_tdnn1a14b_sp exp/chain/cnn_tdnn1a23_sp exp/chain/cnn_tdnn1a23b_sp +# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a14_sp cnn_tdnn1a14b_sp cnn_tdnn1a23_sp cnn_tdnn1a23b_sp +#WER dev_clean_2 (tgsmall) 13.18 13.04 12.14 12.39 12.15 12.11 +# [online:] 13.03 12.97 12.10 12.38 12.18 12.07 +#WER dev_clean_2 (tglarge) 9.18 9.16 8.44 8.69 8.57 8.47 +# [online:] 9.29 9.24 8.58 8.81 8.64 8.50 +# Final train prob -0.0531 -0.0590 -0.0455 -0.0460 -0.0456 -0.0462 +# Final valid prob -0.0844 -0.0865 -0.0806 -0.0802 -0.0800 -0.0802 +# Final train prob (xent) -1.5244 -1.7771 -1.0792 -1.0763 -1.0691 -1.0683 +# Final valid prob (xent) -1.7447 -1.9611 -1.3221 -1.3173 -1.3190 -1.3108 +# Num-params 3512112 3512112 4456224 4456224 4474688 4474688 + +# 1a14 is as 1a13 but with an extra tdnn-f layer. Better! 
+# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a13_sp exp/chain/cnn_tdnn1a14_sp +# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a13_sp cnn_tdnn1a14_sp +#WER dev_clean_2 (tgsmall) 13.18 13.04 12.21 12.14 +# [online:] 13.03 12.97 12.26 12.10 +#WER dev_clean_2 (tglarge) 9.18 9.16 8.65 8.44 +# [online:] 9.29 9.24 8.67 8.58 +# Final train prob -0.0531 -0.0590 -0.0459 -0.0455 +# Final valid prob -0.0844 -0.0865 -0.0810 -0.0806 +# Final train prob (xent) -1.5244 -1.7771 -1.0901 -1.0792 +# Final valid prob (xent) -1.7447 -1.9611 -1.3328 -1.3221 +# Num-params 3512112 3512112 4160544 4456224 + +# 1a13 is as 1a12 but using the same l2 values for the first layers as for the +# later ones (more l2). +# 1a12 is as 1a11 but making the first TDNN-F layer non-splicing and restoring +# the 640's to 768's. +# 1a11 is as 1a10 but adding some l2 to the CNN layers and to the TDNN layers +# for the ivector training. +# run_cnn_tdnn_1a10.sh is as run_cnn_tdnn_1a.sh but reducing the 768's to 640 +# to make the num-params similar to the tdnn1h experiment (run_cnn_tdnn_1a.sh was overfitting +# a bit). +# +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1h.sh, but adding CNN layers +# near the beginning. + +# 1h is as 1g but a re-tuned model based on resnet-style TDNN-F layers with +# bypass connections. Below, 1h2 is just a rerun of 1h with a different --affix +# option, to give some idea of the run-to-run variation. 
+ +# local/chain/compare_wer.sh --online exp/chain/tdnn1g_sp exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp +# System tdnn1g_sp tdnn1h_sp tdnn1h2_sp +#WER dev_clean_2 (tgsmall) 13.50 13.18 13.04 +# [online:] 13.52 13.03 12.97 +#WER dev_clean_2 (tglarge) 9.79 9.18 9.16 +# [online:] 9.79 9.29 9.24 +# Final train prob -0.0460 -0.0531 -0.0590 +# Final valid prob -0.0892 -0.0844 -0.0865 +# Final train prob (xent) -1.1739 -1.5244 -1.7771 +# Final valid prob (xent) -1.4487 -1.7447 -1.9611 +# Num-params 6234672 3512112 3512112 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{g,h,h2}_sp +# exp/chain/tdnn1g_sp: num-iters=25 nj=2..5 num-params=6.2M dim=40+100->2328 combine=-0.056->-0.055 (over 3) xent:train/valid[15,24,final]=(-1.50,-1.23,-1.17/-1.73,-1.52,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.051,-0.046/-0.101,-0.094,-0.089) +# exp/chain/tdnn1h_sp: num-iters=34 nj=2..5 num-params=3.5M dim=40+100->2328 combine=-0.055->-0.050 (over 4) xent:train/valid[21,33,final]=(-1.97,-1.57,-1.52/-2.11,-1.78,-1.74) logprob:train/valid[21,33,final]=(-0.080,-0.061,-0.053/-0.106,-0.096,-0.084) +# exp/chain/tdnn1h2_sp: num-iters=34 nj=2..5 num-params=3.5M dim=40+100->2328 combine=-0.062->-0.056 (over 4) xent:train/valid[21,33,final]=(-2.21,-1.78,-1.78/-2.34,-1.96,-1.96) logprob:train/valid[21,33,final]=(-0.086,-0.066,-0.059/-0.110,-0.098,-0.087) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+affix=1a54 # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2854 combine=-0.042->-0.042 (over 2) xent:train/valid[71,107,final]=(-0.975,-0.640,-0.646/-0.980,-0.678,-0.688) logprob:train/valid[71,107,final]=(-0.067,-0.043,-0.042/-0.069,-0.050,-0.049) +# exp/chain/cnn_tdnn1b17_sp: num-iters=144 nj=2..8 num-params=6.9M dim=40+100->2854 combine=-0.041->-0.041 (over 3) xent:train/valid[95,143,final]=(-0.866,-0.617,-0.620/-0.881,-0.657,-0.659) logprob:train/valid[95,143,final]=(-0.061,-0.042,-0.041/-0.062,-0.050,-0.049) + +# The following table compares chain (TDNN+LSTM, TDNN, CNN+TDNN). +# The CNN+TDNN doesn't seem to have any advantages versus the TDNN (and it's +# about 5 times slower per iteration). But it's not well tuned. +# And the num-params is fewer (5.5M vs 7.6M for TDNN). + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp cnn_tdnn1a_sp +#WER dev93 (tgpr) 7.48 7.87 9.02 +#WER dev93 (tg) 7.41 7.61 8.60 +#WER dev93 (big-dict,tgpr) 5.64 5.71 6.97 +#WER dev93 (big-dict,fg) 5.40 5.10 6.12 +#WER eval92 (tgpr) 5.67 5.23 5.56 +#WER eval92 (tg) 5.46 4.87 5.05 +#WER eval92 (big-dict,tgpr) 3.69 3.24 3.40 +#WER eval92 (big-dict,fg) 3.28 2.71 2.73 +# Final train prob -0.0341 -0.0414 -0.0532 +# Final valid prob -0.0506 -0.0634 -0.0752 +# Final train prob (xent) -0.5643 -0.8216 -1.0857 +# Final valid prob (xent) -0.6648 -0.9208 -1.1505 + + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b17 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.01" + ivector_affine_opts="l2-regularize=0.01" + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=1024 bottleneck-dim=256 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{4,5,6,7}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l " - echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "usage: steps/align_basis_fmllr.sh " + echo "e.g.: steps/align_basis_fmllr.sh data/train data/lang exp/tri4 exp/tri4_ali" + echo "Note: should ideally have been trained by steps/train_sat_basis.sh, or" + echo "if a non-SAT system (not recommended), the basis should have been computed" + echo "by steps/get_fmllr_basis.sh." echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --nj # number of parallel jobs" @@ -57,9 +60,19 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +for f in $srcdir/tree $srcdir/final.mdl $srcdir/fmllr.basis \ + $data/feats.scp $lang/phones.txt; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; cp $lang/phones.txt $dir || exit 1; + cp $srcdir/{tree,final.mdl} $dir || exit 1; cp $srcdir/final.occs $dir; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. 
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py index e1905d0aa48..bf2a90916ae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py @@ -135,11 +135,9 @@ def get_full_config(self): def _generate_config(self): configs = [] name = self.name - input_dim = self.descriptors['input']['dim'] input_descriptor = self.descriptors['input']['final-string'] output_dim = self.config['dim'] - assert output_dim == input_dim bottleneck_dim = self.config['bottleneck-dim'] bypass_scale = self.config['bypass-scale'] dropout_proportion = self.config['dropout-proportion'] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 1d284146e35..01c1b1e533c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -74,7 +74,9 @@ 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent, - 'scale-component': xlayers.XconfigPerElementScaleComponent + 'affine-component': xlayers.XconfigAffineComponent, + 'scale-component': xlayers.XconfigPerElementScaleComponent, + 'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 6b8e3c3a5c2..42cc20293a5 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -206,7 +206,9 @@ def set_default_configs(self): 'dim': -1, 'orthonormal-constraint': '', 'max-change': 0.75, - 'l2-regularize': '' } + 'l2-regularize': '', + 'param-stddev': '', + 'learning-rate-factor': '' } def check_configs(self): if self.config['dim'] <= 0: @@ 
-240,7 +242,8 @@ def _generate_config(self): output_dim = self.config['dim'] opts = '' - for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize']: + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize', + 'param-stddev', 'learning-rate-factor' ]: value = self.config[opt_name] if value != '': opts += ' {0}={1}'.format(opt_name, value) @@ -255,6 +258,171 @@ def _generate_config(self): return configs +class XconfigCombineFeatureMapsLayer(XconfigLayerBase): + """This class is for parsing lines like + 'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4' + It produces a PermuteComponent. It expects its input to be two things + appended together, where the first is of dimension height * num-filters1 and + the second is of dimension height * num-filters2; it interpolates the filters + so the output can be interpreted as a single feature map with the same height + as the input and the sum of the num-filters. + + This is to be used in convolutional setups as part of how we combine the + filterbank inputs with ivectors. 
+ """ + + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = { 'input': '[-1]', + 'num-filters1': -1, + 'num-filters2': -1, + 'height': -1 } + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + if (self.config['num-filters1'] <= 0 or + self.config['num-filters2'] <= 0 or + self.config['height'] <= 0): + raise RuntimeError("invalid values of num-filters1, num-filters2 and/or height") + f1 = self.config['num-filters1'] + f2 = self.config['num-filters2'] + h = self.config['height'] + if input_dim != (f1 + f2) * h: + raise RuntimeError("Expected input-dim={0} based on num-filters1={1}, num-filters2={2} " + "and height={3}, but got input-dim={4}".format( + (f1 + f2) * h, f1, f2, h, input_dim)) + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + dim = self.descriptors['input']['dim'] + num_filters1 = self.config['num-filters1'] + num_filters2 = self.config['num-filters2'] + height = self.config['height'] + assert dim == (num_filters1 + num_filters2) * height + + column_map = [] + for h in range(height): + for f in range(num_filters1): + column_map.append(h * num_filters1 + f) + for f in range(num_filters2): + column_map.append((height * num_filters1) + h * num_filters2 + f) + + configs = [] + line = ('component name={0} type=PermuteComponent column-map={1} '.format( + self.name, ','.join([str(x) for x in column_map]))) + configs.append(line) + + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + + + +class XconfigAffineComponent(XconfigLayerBase): + """This class is for parsing lines like + 'affine-component name=linear1 dim=1024 input=Append(-3,0,3)' + which will produce just a single component, of type NaturalGradientAffineComponent, + with output-dim 1024 in this case, and input-dim determined by the dimension + of the input . + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + dim=-1 [Dimension of the output] + + The following (shown with their effective defaults) are just passed through + to the component's config line. 
+ + orthonormal-constraint=0.0 + max-change=0.75 + l2-regularize=0.0 + + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'dim': -1, + 'orthonormal-constraint': '', + 'max-change': 0.75, + 'param-stddev': '', + 'bias-stddev': '', + 'l2-regularize': '' } + + def check_configs(self): + if self.config['dim'] <= 0: + raise RuntimeError("'dim' must be specified and > 0.") + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + assert self.config['dim'] > 0 + return self.config['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + + opts = '' + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize', + 'param-stddev', 'bias-stddev']: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=NaturalGradientAffineComponent input-dim={1} output-dim={2} ' + '{3}'.format(self.name, input_dim, output_dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + class XconfigPerElementScaleComponent(XconfigLayerBase): """This class is for parsing lines like 'scale-component name=scale1 input=Append(-3,0,3)' diff --git a/egs/wsj/s5/utils/parallel/pbs.pl b/egs/wsj/s5/utils/parallel/pbs.pl index 6c8d4488882..cbde8eb86d5 100755 --- a/egs/wsj/s5/utils/parallel/pbs.pl +++ b/egs/wsj/s5/utils/parallel/pbs.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# This is a version of the queue.pl modified so that it works under PBS +# This is a version of the queue.pl modified so that it works under PBS # The PBS is one of the several "almost compatible" queueing systems. The # command switches and environment variables are different, so we are adding # a this script. 
An optimal solution might probably be to make the variable diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 5ea3a236b0a..dcca5a76cde 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -83,11 +83,11 @@ Real VecMatVec(const CuVectorBase<Real> &v1, const CuMatrixBase<Real> &M, const CuVectorBase<Real> &v2) { KALDI_ASSERT(v1.Dim() == M.NumRows() && M.NumCols() == v2.Dim()); if (v1.Dim() > v2.Dim()) { // do v2*M first - CuVector<Real> v2M(v1.Dim(), kUndefined); + CuVector<Real> v2M(v1.Dim()); v2M.AddMatVec(1.0, M, kNoTrans, v2, 0.0); return VecVec(v2M, v1); } else { // do v1*M first - CuVector<Real> v1M(v2.Dim(), kUndefined); + CuVector<Real> v1M(v2.Dim()); v1M.AddMatVec(1.0, M, kTrans, v1, 0.0); return VecVec(v1M, v2); } diff --git a/src/gmmbin/gmm-basis-fmllr-training.cc b/src/gmmbin/gmm-basis-fmllr-training.cc index 40c86be670b..3d93c3ca877 100644 --- a/src/gmmbin/gmm-basis-fmllr-training.cc +++ b/src/gmmbin/gmm-basis-fmllr-training.cc @@ -36,7 +36,7 @@ int main(int argc, char *argv[]) { const char *usage = "Estimate fMLLR basis representation. Reads a set of gradient scatter\n" "accumulations. 
Outputs basis matrices.\n" - "Usage: gmm-basis-fmllr-training [options] " + "Usage: gmm-basis-fmllr-training [options] " " ...\n"; bool binary_write = true; @@ -86,4 +86,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index 1a5ceabab0e..5206caac9e2 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -37,25 +37,36 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, bool b = config_line.ParseLine(config_lines[i]); KALDI_ASSERT(b && "Could not parse config line."); if (config_line.FirstToken() == "component-node") { + // What we're trying to do here is: find a line like: + // component-node name=foo component=foo input=Append(bar, ReplaceIndex(ivector, t, 0)) + // we want to replace it with something like: + // component-node name=foo component=foo input=Append(bar, ReplaceIndex(ivector, t, 0)) + // .. and we want this to also work if instead of 'ivector' it has something like + // Scale(0.5, ivector). We assume that ReplaceIndex() expressions only occur in this + // type of context. std::string whole_line = config_lines[i]; std::string to_search_for = "ReplaceIndex("; std::string::size_type to_search_for_size = to_search_for.size(); std::string::size_type pos = whole_line.find(to_search_for); if (pos != std::string::npos) { - std::string::size_type comma_pos = whole_line.find(',', pos); + std::string::size_type comma_pos = whole_line.find(", t, 0)", pos); if (comma_pos != std::string::npos) { // if the line contained ReplaceIndex(ivector, t, 0), // descriptor_name would now be 'ivector'. std::string descriptor_name = whole_line.substr(pos + to_search_for_size, comma_pos - (pos + to_search_for_size)); - std::string::size_type end_pos = whole_line.find(')', pos); - std::string::size_type expr_size = end_pos + 1 - pos; + // Note: 7, below, is the size of: ", t, 0)". 
+ std::string::size_type end_pos = comma_pos + 7; + std::string::size_type expr_size = end_pos - pos; // e.g. expr_size would be strlen("ReplaceIndex(ivector, t, 0)"). std::ostringstream to_replace_with; to_replace_with << "Round(" << descriptor_name << ", " << ivector_period << ")"; whole_line.replace(pos, expr_size, to_replace_with.str()); config_to_read << whole_line << "\n"; + } else { + KALDI_ERR << "Could not process the ReplaceIndex expression in: " + << whole_line; } } }