diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..1112f0ec08b --- /dev/null +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,318 @@ +#!/bin/bash + +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1b.sh but taking +# the xconfig from mini-librispeech's run_cnn_tdnn_1a54.sh; only +# reducing the bottleneck-dim from 96 to 64, which is the value +# the run_tdnn1b.sh script here has. Results are better. +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a_sp +# %WER devtest 53.07 52.54 51.10 +# %WER test 59.25 53.70 52.07 +# %WER native 54.47 48.76 47.88 +# %WER nonnative 63.01 57.66 55.51 +# Final train prob -0.0253 -0.0547 -0.0502 +# Final valid prob -0.0687 -0.0694 -0.0661 +# Final train prob (xent) -0.7715 -0.9502 -0.8513 +# Final valid prob (xent) -1.0719 -1.0849 -0.9915 +# Num-params 6567648 3321312 3345088 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="native nonnative devtest test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_leaves=3500 + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --cmd "$train_cmd" \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + $num_leaves \ + ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + cnn_opts="l2-regularize=0.03" + ivector_layer_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 \ + data/lang_test \ + $tree_dir \ + $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.01" + ivector_affine_opts="l2-regularize=0.01" + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=1024 bottleneck-dim=256 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{4,5,6,7}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l " - echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "usage: steps/align_basis_fmllr.sh " + echo "e.g.: steps/align_basis_fmllr.sh data/train data/lang exp/tri4 exp/tri4_ali" + echo "Note: should ideally have been trained by steps/train_sat_basis.sh, or" + echo "if a non-SAT system (not recommended), the basis should have been computed" + echo "by steps/get_fmllr_basis.sh." echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --nj # number of parallel jobs" @@ -58,9 +61,19 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +for f in $srcdir/tree $srcdir/final.mdl $srcdir/fmllr.basis \ + $data/feats.scp $lang/phones.txt; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; cp $lang/phones.txt $dir || exit 1; + cp $srcdir/{tree,final.mdl} $dir || exit 1; cp $srcdir/final.occs $dir; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. 
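A minimal sketch (Python, illustrative only, using a hypothetical helper name) of the column map that the combine-feature-maps-layer added in the xconfig changes below builds for its PermuteComponent: the two appended inputs, with num-filters1 and num-filters2 filters at each of height frequency bins, are reordered so that every height bin carries the filters of both inputs side by side.

# Hypothetical helper, not part of this patch; it mirrors the column-map
# construction done inside XconfigCombineFeatureMapsLayer._generate_config().
def combine_feature_maps_column_map(num_filters1, num_filters2, height):
    column_map = []
    for h in range(height):
        # columns of input 1 for this height bin
        for f in range(num_filters1):
            column_map.append(h * num_filters1 + f)
        # columns of input 2 for this height bin (input 2 starts after all of input 1)
        for f in range(num_filters2):
            column_map.append(height * num_filters1 + h * num_filters2 + f)
    return column_map

# Tiny example: height=2, one filter in input 1, two filters in input 2.
assert combine_feature_maps_column_map(1, 2, 2) == [0, 2, 3, 1, 4, 5]

In the recipes above (num-filters1=1, num-filters2=5, height=40), this reorders the 240-dimensional Append(idct-batchnorm, ivector-batchnorm) into a single 6-filter-by-40-height feature map for the following conv-relu-batchnorm layers.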
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py index e1905d0aa48..bf2a90916ae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py @@ -135,11 +135,9 @@ def get_full_config(self): def _generate_config(self): configs = [] name = self.name - input_dim = self.descriptors['input']['dim'] input_descriptor = self.descriptors['input']['final-string'] output_dim = self.config['dim'] - assert output_dim == input_dim bottleneck_dim = self.config['bottleneck-dim'] bypass_scale = self.config['bypass-scale'] dropout_proportion = self.config['dropout-proportion'] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 1d284146e35..01c1b1e533c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -74,7 +74,9 @@ 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent, - 'scale-component': xlayers.XconfigPerElementScaleComponent + 'affine-component': xlayers.XconfigAffineComponent, + 'scale-component': xlayers.XconfigPerElementScaleComponent, + 'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 6b8e3c3a5c2..42cc20293a5 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -206,7 +206,9 @@ def set_default_configs(self): 'dim': -1, 'orthonormal-constraint': '', 'max-change': 0.75, - 'l2-regularize': '' } + 'l2-regularize': '', + 'param-stddev': '', + 'learning-rate-factor': '' } def check_configs(self): if self.config['dim'] <= 0: @@ -240,7 +242,8 @@ def _generate_config(self): output_dim = self.config['dim'] opts = '' - for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize']: + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize', + 'param-stddev', 'learning-rate-factor' ]: value = self.config[opt_name] if value != '': opts += ' {0}={1}'.format(opt_name, value) @@ -255,6 +258,171 @@ def _generate_config(self): return configs +class XconfigCombineFeatureMapsLayer(XconfigLayerBase): + """This class is for parsing lines like + 'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4' + It produces a PermuteComponent. It expects its input to be two things + appended together, where the first is of dimension height * num-filters1 and + the second is of dimension height * num-filters2; it interpolates the filters + so the output can be interpreted as a single feature map with the same height + as the input and the sum of the num-filters. + + This is to be used in convolutional setups as part of how we combine the + filterbank inputs with ivectors. 
+ """ + + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = { 'input': '[-1]', + 'num-filters1': -1, + 'num-filters2': -1, + 'height': -1 } + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + if (self.config['num-filters1'] <= 0 or + self.config['num-filters2'] <= 0 or + self.config['height'] <= 0): + raise RuntimeError("invalid values of num-filters1, num-filters2 and/or height") + f1 = self.config['num-filters1'] + f2 = self.config['num-filters2'] + h = self.config['height'] + if input_dim != (f1 + f2) * h: + raise RuntimeError("Expected input-dim={0} based on num-filters1={1}, num-filters2={2} " + "and height={3}, but got input-dim={4}".format( + (f1 + f2) * h, f1, f2, h, input_dim)) + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + dim = self.descriptors['input']['dim'] + num_filters1 = self.config['num-filters1'] + num_filters2 = self.config['num-filters2'] + height = self.config['height'] + assert dim == (num_filters1 + num_filters2) * height + + column_map = [] + for h in range(height): + for f in range(num_filters1): + column_map.append(h * num_filters1 + f) + for f in range(num_filters2): + column_map.append((height * num_filters1) + h * num_filters2 + f) + + configs = [] + line = ('component name={0} type=PermuteComponent column-map={1} '.format( + self.name, ','.join([str(x) for x in column_map]))) + configs.append(line) + + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + + + +class XconfigAffineComponent(XconfigLayerBase): + """This class is for parsing lines like + 'affine-component name=linear1 dim=1024 input=Append(-3,0,3)' + which will produce just a single component, of type NaturalGradientAffineComponent, + with output-dim 1024 in this case, and input-dim determined by the dimension + of the input . + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + dim=-1 [Dimension of the output] + + The following (shown with their effective defaults) are just passed through + to the component's config line. 
+ + orthonormal-constraint=0.0 + max-change=0.75 + l2-regularize=0.0 + + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'dim': -1, + 'orthonormal-constraint': '', + 'max-change': 0.75, + 'param-stddev': '', + 'bias-stddev': '', + 'l2-regularize': '' } + + def check_configs(self): + if self.config['dim'] <= 0: + raise RuntimeError("'dim' must be specified and > 0.") + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + assert self.config['dim'] > 0 + return self.config['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + + opts = '' + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize', + 'param-stddev', 'bias-stddev']: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=NaturalGradientAffineComponent input-dim={1} output-dim={2} ' + '{3}'.format(self.name, input_dim, output_dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + class XconfigPerElementScaleComponent(XconfigLayerBase): """This class is for parsing lines like 'scale-component name=scale1 input=Append(-3,0,3)' diff --git a/egs/wsj/s5/utils/parallel/pbs.pl b/egs/wsj/s5/utils/parallel/pbs.pl index 6c8d4488882..cbde8eb86d5 100755 --- a/egs/wsj/s5/utils/parallel/pbs.pl +++ b/egs/wsj/s5/utils/parallel/pbs.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# This is a version of the queue.pl modified so that it works under PBS +# This is a version of the queue.pl modified so that it works under PBS # The PBS is one of the several "almost compatible" queueing systems. The # command switches and environment variables are different, so we are adding # a this script. 
An optimal solution might probably be to make the variable
diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc
index 5ea3a236b0a..dcca5a76cde 100644
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
@@ -83,11 +83,11 @@ Real VecMatVec(const CuVectorBase &v1, const CuMatrixBase &M,
                const CuVectorBase &v2) {
   KALDI_ASSERT(v1.Dim() == M.NumRows() && M.NumCols() == v2.Dim());
   if (v1.Dim() > v2.Dim()) { // do v2*M first
-    CuVector v2M(v1.Dim(), kUndefined);
+    CuVector v2M(v1.Dim());
     v2M.AddMatVec(1.0, M, kNoTrans, v2, 0.0);
     return VecVec(v2M, v1);
   } else { // do v1*M first
-    CuVector v1M(v2.Dim(), kUndefined);
+    CuVector v1M(v2.Dim());
     v1M.AddMatVec(1.0, M, kTrans, v1, 0.0);
     return VecVec(v1M, v2);
   }
diff --git a/src/gmmbin/gmm-basis-fmllr-training.cc b/src/gmmbin/gmm-basis-fmllr-training.cc
index 40c86be670b..3d93c3ca877 100644
--- a/src/gmmbin/gmm-basis-fmllr-training.cc
+++ b/src/gmmbin/gmm-basis-fmllr-training.cc
@@ -36,7 +36,7 @@ int main(int argc, char *argv[]) {
     const char *usage =
         "Estimate fMLLR basis representation. Reads a set of gradient scatter\n"
         "accumulations. Outputs basis matrices.\n"
-        "Usage: gmm-basis-fmllr-training [options] "
+        "Usage: gmm-basis-fmllr-training [options] "
         " ...\n";
 
     bool binary_write = true;
@@ -86,4 +86,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-
diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc
index 1a5ceabab0e..5206caac9e2 100644
--- a/src/nnet3/nnet-compile-looped.cc
+++ b/src/nnet3/nnet-compile-looped.cc
@@ -37,25 +37,36 @@ void ModifyNnetIvectorPeriod(int32 ivector_period,
     bool b = config_line.ParseLine(config_lines[i]);
     KALDI_ASSERT(b && "Could not parse config line.");
     if (config_line.FirstToken() == "component-node") {
+      // What we're trying to do here is: find a line like:
+      //  component-node name=foo component=foo input=Append(bar, ReplaceIndex(ivector, t, 0))
+      // we want to replace it with something like:
+      //  component-node name=foo component=foo input=Append(bar, ReplaceIndex(ivector, t, 0))
+      // .. and we want this to also work if instead of 'ivector' it has something like
+      //  Scale(0.5, ivector). We assume that ReplaceIndex() expressions only occur in this
+      //  type of context.
       std::string whole_line = config_lines[i];
       std::string to_search_for = "ReplaceIndex(";
       std::string::size_type to_search_for_size = to_search_for.size();
       std::string::size_type pos = whole_line.find(to_search_for);
       if (pos != std::string::npos) {
-        std::string::size_type comma_pos = whole_line.find(',', pos);
+        std::string::size_type comma_pos = whole_line.find(", t, 0)", pos);
         if (comma_pos != std::string::npos) {
           // if the line contained ReplaceIndex(ivector, t, 0),
           // descriptor_name would now be 'ivector'.
           std::string descriptor_name =
               whole_line.substr(pos + to_search_for_size,
                                 comma_pos - (pos + to_search_for_size));
-          std::string::size_type end_pos = whole_line.find(')', pos);
-          std::string::size_type expr_size = end_pos + 1 - pos;
+          // Note: 7, below, is the size of: ", t, 0)".
+          std::string::size_type end_pos = comma_pos + 7;
+          std::string::size_type expr_size = end_pos - pos;
           // e.g. expr_size would be strlen("ReplaceIndex(ivector, t, 0)").
           std::ostringstream to_replace_with;
           to_replace_with << "Round(" << descriptor_name << ", " << ivector_period << ")";
           whole_line.replace(pos, expr_size, to_replace_with.str());
           config_to_read << whole_line << "\n";
+        } else {
+          KALDI_ERR << "Could not process the ReplaceIndex expression in: "
+                    << whole_line;
         }
       }
     }
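To make the nnet-compile-looped.cc change above concrete, here is a minimal Python sketch (illustrative only, not part of the patch; the function name is hypothetical) of the rewrite ModifyNnetIvectorPeriod now performs: it anchors on the literal ", t, 0)" instead of the first comma, so a descriptor such as Scale(0.5, ivector) inside ReplaceIndex(...) stays intact, and the whole ReplaceIndex(...) expression is replaced with Round(descriptor, ivector_period).

# Hypothetical, simplified re-implementation of the string rewrite done in C++ above.
def rewrite_replace_index(line, ivector_period):
    key = "ReplaceIndex("
    pos = line.find(key)
    if pos == -1:
        return line
    # Find the closing ", t, 0)" rather than the first ',' or ')', so that
    # descriptors like "Scale(0.5, ivector)" inside ReplaceIndex() are kept whole.
    comma_pos = line.find(", t, 0)", pos)
    if comma_pos == -1:
        raise RuntimeError("Could not process the ReplaceIndex expression in: " + line)
    descriptor = line[pos + len(key):comma_pos]
    end_pos = comma_pos + len(", t, 0)")
    return line[:pos] + "Round({0}, {1})".format(descriptor, ivector_period) + line[end_pos:]

For example, with ivector_period=10, the line
  component-node name=foo component=foo input=Append(bar, ReplaceIndex(Scale(0.5, ivector), t, 0))
becomes
  component-node name=foo component=foo input=Append(bar, Round(Scale(0.5, ivector), 10))
which is the case that the old first-comma / first-parenthesis search handled incorrectly for Scale(...) descriptors.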