diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh index d9437af7e0c..53221a2bd53 100755 --- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index c0af57b4a5d..0776bc05923 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -48,6 +48,13 @@ %WER 24.2 | 13098 94477 | 79.3 12.2 8.6 3.5 24.2 57.1 | -0.178 | exp/ihm/nnet3/tdnn_sp/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 25.4 | 12643 89970 | 77.6 13.7 8.7 3.0 25.4 56.3 | -0.067 | exp/ihm/nnet3/tdnn_sp/decode_eval/ascore_12/eval_hires.ctm.filt.sys +# local/nnet3/run_blstm.sh --mic ihm +# nnet3 xent BLSTM with data cleaning +# for d in exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 22.3 | 13098 94494 | 80.9 11.7 7.4 3.2 22.3 55.7 | -0.618 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.5 | 12643 89962 | 80.2 12.7 7.1 2.7 22.5 53.4 | -0.476 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys + ############################################ # local/chain/run_tdnn.sh --mic ihm --stage 12 & @@ -62,3 +69,15 @@ for d in exp/ihm/chain/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 22.4 | 13098 94476 | 80.4 10.4 9.2 2.8 22.4 54.6 | 0.069 | exp/ihm/chain/tdnn_sp_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 22.5 | 12643 89974 | 80.0 12.1 7.9 2.6 22.5 52.8 | 0.157 | exp/ihm/chain/tdnn_sp_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys + +# local/chain/multi_condition/run_tdnn.sh --mic ihm & +# cleanup + chain TDNN model + IHM reverberated data +# for d in exp/ihm/chain_cleaned_rvb/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 21.5 | 13098 94486 | 81.8 11.0 7.2 3.3 21.5 54.6 | 0.090 | exp/ihm/chain_cleaned_rvb/tdnn_sp_rvb_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 21.9 | 12643 89985 | 80.8 12.3 6.9 2.7 21.9 52.5 | 0.183 | exp/ihm/chain_cleaned_rvb/tdnn_sp_rvb_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys + + +# local/chain/tuning/run_tdnn_lstm_1i.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model +%WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys +%WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_mdm b/egs/ami/s5b/RESULTS_mdm index da56d650e73..80eb152fc5d 100644 --- a/egs/ami/s5b/RESULTS_mdm +++ b/egs/ami/s5b/RESULTS_mdm @@ -65,7 +65,6 
@@ # cleanup + chain TDNN model, alignments from IHM data (IHM alignments help). # local/chain/run_tdnn.sh --mic mdm8 --use-ihm-ali true --stage 12 & -# *** best system *** # for d in exp/mdm8/chain_cleaned/tdnn_sp_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 37.4 | 15286 94509 | 66.6 18.0 15.5 3.9 37.4 62.8 | 0.624 | exp/mdm8/chain_cleaned/tdnn_sp_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 40.6 | 13381 89982 | 62.7 18.9 18.3 3.3 40.6 67.6 | 0.594 | exp/mdm8/chain_cleaned/tdnn_sp_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys @@ -78,4 +77,15 @@ %WER 37.9 | 15635 94514 | 66.5 19.1 14.4 4.4 37.9 61.2 | 0.646 | exp/mdm8/chain/tdnn_sp_bi_ihmali/decode_dev/ascore_8/dev_hires_o4.ctm.filt.sys %WER 41.5 | 13884 89975 | 62.3 20.3 17.4 3.8 41.5 66.0 | 0.621 | exp/mdm8/chain/tdnn_sp_bi_ihmali/decode_eval/ascore_8/eval_hires_o4.ctm.filt.sys +# local/chain/multi_condition/run_tdnn.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN model, MDM original + IHM reverberated data, alignments from IHM data +# for d in exp/mdm8/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 35.8 | 14512 94498 | 68.2 17.2 14.6 4.0 35.8 64.9 | 0.632 | exp/mdm8/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 39.1 | 13651 89967 | 64.3 18.4 17.3 3.3 39.1 65.2 | 0.607 | exp/mdm8/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1i.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model, MDM audio and alignments from IHM data +# *** best system *** +%WER 34.6 | 15116 94508 | 69.6 17.6 12.9 4.1 34.6 62.3 | 0.687 | exp/mdm8/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 37.1 | 14343 90002 | 66.3 18.8 14.9 3.4 37.1 62.3 | 0.659 | exp/mdm8/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index cec525d797d..f0177a45078 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -46,6 +46,12 @@ %WER 41.6 | 14493 94516 | 63.3 23.5 13.2 4.9 41.6 66.8 | 0.639 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_dev/ascore_13/dev_hires_o4.ctm.filt.sys %WER 46.0 | 13597 89967 | 57.5 24.9 17.6 3.6 46.0 68.1 | 0.601 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_eval/ascore_14/eval_hires_o4.ctm.filt.sys +# xent BLSTM system; cleaned data and IHM alignments. +# local/nnet3/run_blstm.sh --mic sdm1 --use-ihm-ali true +# for d in exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 37.8 | 14633 94518 | 67.1 22.3 10.7 4.9 37.8 64.2 | 0.745 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 41.4 | 13809 89628 | 62.7 24.1 13.2 4.1 41.4 65.2 | 0.723 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys # ========================= @@ -62,7 +68,6 @@ # cleanup + chain TDNN model, alignments from IHM data (IHM alignments help). # local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --stage 12 & # cleanup + chain TDNN model, cleaned data and alignments from ihm data. 
-# *** best system *** # for d in exp/sdm1/chain_cleaned/tdnn_sp_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 40.7 | 14321 94501 | 63.0 19.6 17.4 3.7 40.7 67.7 | 0.592 | exp/sdm1/chain_cleaned/tdnn_sp_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 44.8 | 14293 89976 | 58.6 21.3 20.1 3.3 44.8 64.2 | 0.559 | exp/sdm1/chain_cleaned/tdnn_sp_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys @@ -75,4 +80,16 @@ %WER 40.7 | 14549 94520 | 63.6 21.4 15.0 4.3 40.7 66.2 | 0.617 | exp/sdm1/chain/tdnn_sp_bi_ihmali/decode_dev/ascore_8/dev_hires_o4.ctm.filt.sys %WER 45.1 | 13296 89971 | 59.1 23.4 17.6 4.2 45.1 69.5 | 0.591 | exp/sdm1/chain/tdnn_sp_bi_ihmali/decode_eval/ascore_8/eval_hires_o4.ctm.filt.sys +# local/chain/multi_condition/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# cleanup + chain TDNN model, SDM original + IHM reverberated data, alignments from ihm data. +# *** best system *** +# for d in exp/sdm1/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 38.6 | 14760 94502 | 65.3 19.3 15.4 3.9 38.6 64.9 | 0.599 | exp/sdm1/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 42.7 | 14070 89982 | 60.9 21.0 18.0 3.6 42.7 64.5 | 0.571 | exp/sdm1/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1i.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN model, SDM audio + alignments from ihm data. +# *** best system *** +%WER 37.6 | 15122 94495 | 66.1 18.7 15.1 3.7 37.6 63.2 | 0.646 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/compare_wer_general.sh b/egs/ami/s5b/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..225890daf5c --- /dev/null +++ b/egs/ami/s5b/local/chain/compare_wer_general.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +mic=$1; +shift; + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +#for d in exp/sdm1/chain_cleaned/tdnn*/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done|grep eval_hires + + +echo -n "WER on dev " +for x in $*; do + wer=$(grep Sum exp/$mic/chain_cleaned/${x}/decode_dev*/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval " +for x in $*; do + wer=$(grep Sum exp/$mic/chain_cleaned/${x}/decode_eval*/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_valid.final.log 
| grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh new file mode 100755 index 00000000000..28c9849d885 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh @@ -0,0 +1,283 @@ +#!/bin/bash + +# This is a chain-training script with TDNN neural networks. +# This script is based on local/chain/run_tdnn.sh, but adding +# the reverberated IHM data into the train set. +# This script obtains better results on both IHM and SDM tasks. + +# Please see RESULTS_* for examples of command lines invoking this script. + +# local/chain/multi_condition/run_tdnn.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/run_tdnn.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=1 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=true +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
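+  # (Only the original data is aligned here, into $original_lat_dir; lattices for the rev*_ copies of the IHM data are copied from the IHM lattice dir and merged into $lat_dir further below.)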
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats + + mkdir -p $lat_dir/temp/ + mkdir -p $lat_dir/temp2/ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.*.gz |" ark,scp:$lat_dir/temp2/lats.ark,$lat_dir/temp2/lats.scp + + # copy the lattices for the reverberated data + rm -f $lat_dir/temp/combined_lats.scp + touch $lat_dir/temp/combined_lats.scp + cat $lat_dir/temp/lats.scp >> $lat_dir/temp/combined_lats.scp + for i in `seq 1 $num_data_reps`; do + cat $lat_dir/temp2/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp + done + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; + echo "1" > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/$mic/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-rvb$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh deleted file mode 100755 index 8df62af8bad..00000000000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ /dev/null @@ -1,242 +0,0 @@ -#!/bin/bash - -# This is a chain-training script with TDNN neural networks. -# Please see RESULTS_* for examples of command lines invoking this script. - - -# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali true --mic sdm1 # rerunning with biphone -# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali false --mic sdm1 - -# local/chain/run_tdnn.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & - -# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --stage 12 & -# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & - -# local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned& - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -mic=ihm -nj=30 -min_seg_len=1.55 -use_ihm_ali=false -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix= #affix for TDNN directory, e.g. 
"a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 13 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 14 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 15 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/$mic/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 450 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 16 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage - fi - - touch $dir/egs/.nodelete # keep egs around when that run dies. - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - -graph_dir=$dir/graph_${LM} -if [ $stage -le 17 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir -fi - -if [ $stage -le 18 ]; then - rm $dir/.error 2>/dev/null || true - for decode_set in dev eval; do - ( - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..8df62af8bad --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,242 @@ +#!/bin/bash + +# This is a chain-training script with TDNN neural networks. +# Please see RESULTS_* for examples of command lines invoking this script. + + +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali true --mic sdm1 # rerunning with biphone +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali false --mic sdm1 + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --stage 12 & +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned& + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
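+  # (The tree uses a left-biphone context, i.e. --context-width=2 --central-position=1, matching the --left-biphone graph built in stage 17.)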
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/$mic/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..a262f8e1860 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# This is a chain-training script with TDNN neural networks. +# Please see RESULTS_* for examples of command lines invoking this script. 
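+# (1b generates the network config with the xconfig parser in stage 15 below, instead of steps/nnet3/tdnn/make_configs.py as in tuning/run_tdnn_1a.sh; the training options are otherwise the same.)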
+ + +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali true --mic sdm1 # rerunning with biphone +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali false --mic sdm1 + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --stage 12 & +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned& + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..64cde69e7dd --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# same as 1b but with shorter minibatches + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1c #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
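+  # The 'shorter minibatches' change: --trainer.num-chunk-per-minibatch is 64 here, vs. 128 in run_tdnn_1b.sh.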
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..ba136e67521 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# TDNN+LSTM architecture similar to swbd/tdnn_lstm_1b +# results on sdm1 with ihm ali +#System tdnn1b tdnn_lstm1a +#WER on dev 39.9 38.9 +#WER on eval 43.9 42.2 +#Final train prob -0.186387 -0.142585 +#Final valid prob -0.259997 -0.251197 +#Final train prob (xent) -2.4593 -1.73176 +#Final valid prob (xent) -2.70347 -2.26965 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. 
./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
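+  # For example, with xent_regularize=0.1 as set above, learning_rate_factor = 0.5/0.1 = 5.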
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
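+  # The --self-loop-scale 1.0 below matches the acoustic scale of 1.0 used when decoding the chain model in stage 18.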
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..ed615a98e30 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# same as 1a but the neural network has two more TDNN layers (0,3 0,3) +# above the lstm +# results on sdm1 with ihm ali + +#System 1a 1b +#WER on dev 38.9 39.6 +#WER on eval 42.2 42.9 +#Final train prob -0.142585 -0.152283 +#Final valid prob -0.251197 -0.253287 +#Final train prob (xent) -1.73176 -1.77542 +#Final valid prob (xent) -2.26965 -2.28851 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1b #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn8 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(0,3) dim=512 + + ## adding the layers for chain branch + output-layer name=output input=tdnn9 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=tdnn9 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh new file mode 100755 index 00000000000..ce719d6f2cb --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -0,0 +1,291 @@ +#!/bin/bash + +# same as 1a, but with more TDNN layers between each LSTM +# results on sdm1 with ihm ali +#System 1a 1c +#WER on dev 38.9 39.0 +#WER on eval 42.2 41.9 +#Final train prob -0.142585 -0.142951 +#Final valid prob -0.251197 -0.249901 +#Final train prob (xent) -1.73176 -1.71779 +#Final valid prob (xent) -2.26965 -2.22776 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1c #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for 
xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
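# Note on stage 15 above: with the default xent_regularize=0.1, the
# xent-branch learning-rate factor works out to 0.5 / 0.1 = 5.0.  The
#   echo "print 0.5/$xent_regularize" | python
# construction relies on 'python' being Python 2 (print as a statement); on
# a machine where 'python' is Python 3, something along these lines should
# be equivalent (illustrative sketch only, not part of the patch):
#   learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")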
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh new file mode 100755 index 00000000000..22967036cb2 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1c, but with more TDNN layers between each LSTM +# results on sdm1 with ihm ali + +#System tdnn_lstm1c_sp_bi_ihmali_ld5 tdnn_lstm1d_sp_bi_ihmali_ld5 +#WER on dev 39.0 39.1 +#WER on eval 41.9 42.0 +#Final train prob -0.142951 -0.150625 +#Final valid prob -0.249901 -0.248819 +#Final train prob (xent) -1.71779 -1.75401 +#Final valid prob (xent) -2.22776 -2.24072 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1d #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn10 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn11 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..6e73457a772 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1c but with only right context for the TDNNs i.e., (0,3) in place +# of (-3,0,3) +# results on sdm1 with ihm ali + +#System tdnn_lstm1c_sp_bi_ihmali_ld5tdnn_lstm1e_sp_bi_ihmali_ld5 +#WER on dev 39.0 39.4 +#WER on eval 41.9 42.4 +#Final train prob -0.142951 -0.152498 +#Final valid prob -0.249901 -0.251393 +#Final train prob (xent) -1.71779 -1.77722 +#Final valid prob (xent) -2.22776 -2.26705 +# + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
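# Note: the dev/eval comparison in the header above was obtained on sdm1
# using IHM alignments; with the option names defined in this script, an
# invocation along those lines (illustrative only, not the exact command
# used) would look like:
#   local/chain/tuning/run_tdnn_lstm_1e.sh --mic sdm1 --use-ihm-ali true \
#     --train-set train_cleaned --gmm tri3_cleaned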
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay 
include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
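# Note on the xconfig in stage 15 above: Append(0,3) splices each TDNN's
# input at offsets t and t+3, so these layers add right context only,
# whereas the Append(-3,0,3) layers in 1c add three frames on each side.
# As rough arithmetic, per block of three TDNN layers between LSTMs:
#   echo "1e: left=$((0*3)) right=$((3*3))"   # 0 frames left, 9 right
#   echo "1c: left=$((3*3)) right=$((3*3))"   # 9 frames left, 9 right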
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh new file mode 100755 index 00000000000..3c4df056460 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# same as 1a but the neural network has two more TDNN layers (0,3 0,3) +# above the lstm +# results on sdm1 with ihm ali +# +#System tdnn_lstm1a_sp_bi_ihmali_ld5tdnn_lstm1f_sp_bi_ihmali_ld5 +#WER on dev 38.9 39.4 +#WER on eval 42.2 42.7 +#Final train prob -0.142585 -0.15514 +#Final valid prob -0.251197 -0.253257 +#Final train prob (xent) -1.73176 -1.80786 +#Final valid prob (xent) -2.26965 -2.29771 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1f #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn8 input=Append(0,3) dim=512 + tanh-layer name=tdnn9 input=Append(0,3) dim=512 + + ## adding the layers for chain branch + output-layer name=output input=tdnn9 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=tdnn9 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..cce5f2f5f3e --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1c but with smaller minibatch +# using smaller minibatches seems to be better in TDNN+LSTM archs. +# not much difference in other archs. +# results on sdm1 using ihm ali +#System tdnn_lstm1c_sp_bi_ihmali_ld5tdnn_lstm1g_sp_bi_ihmali_ld5 +#WER on dev 39.0 38.3 +#WER on eval 41.9 41.6 +#Final train prob -0.142951 -0.138017 +#Final valid prob -0.249901 -0.238659 +#Final train prob (xent) -1.71779 -1.66834 +#Final valid prob (xent) -2.22776 -2.17419 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1g #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay 
include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
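# Note on stage 16 above: the change relative to 1c is the smaller
# minibatch (--trainer.num-chunk-per-minibatch 64 instead of 128).  With
# chunk_width=150 and a frame-subsampling factor of 3, each chunk carries
# about 150/3 = 50 supervision frames, so a minibatch covers roughly:
#   echo $(( 150 / 3 * 64 ))    # ~3200 output frames per minibatch (1g)
#   echo $(( 150 / 3 * 128 ))   # ~6400 output frames per minibatch (1c)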
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh new file mode 100755 index 00000000000..c306849632a --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -0,0 +1,294 @@ +#!/bin/bash + +# same as 1c but with one more stack of TDNN and LSTM layers +# results on sdm1 using ihm ali +#System tdnn_lstm1c_sp_bi_ihmali_ld5 tdnn_lstm1h_sp_bi_ihmali_ld5 +#WER on dev 39.0 39.4 +#WER on eval 41.9 42.6 +#Final train prob -0.142951 -0.157634 +#Final valid prob -0.249901 -0.24945 +#Final train prob (xent) -1.71779 -1.7585 +#Final valid prob (xent) -2.22776 -2.2512 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1h #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn10 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn11 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn12 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm4 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh new file mode 100755 index 00000000000..3f8ff14efd9 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# same as 1g but with TDNN output dim 1024 instead of 512 +# (num-params 1g:21309812 1i: 43447156) +# results on sdm1 using ihm ali +#System tdnn_lstm1g_sp_bi_ihmali_ld5 tdnn_lstm1i_sp_bi_ihmali_ld5 +#WER on dev 38.3 37.6 +#WER on eval 41.6 40.9 +#Final train prob -0.138017 -0.114135 +#Final valid prob -0.238659 -0.245208 +#Final train prob (xent) -1.66834 -1.47648 +#Final valid prob (xent) -2.17419 -2.16365 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1i #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 
output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
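# Note on stage 16 above: unlike the other tuning scripts in this patch,
# 1i drops --chain.left-deriv-truncate 0 in favour of
# --trainer.deriv-truncate-margin 8, matching the swap made in the BLSTM
# recipes elsewhere in this patch.  To check which tuning scripts use which
# option after applying the patch, something like this (run from egs/ami/s5b)
# would do; illustrative only:
#   grep -l -- '--trainer.deriv-truncate-margin' local/chain/tuning/run_tdnn_lstm_1*.sh
#   grep -l -- '--chain.left-deriv-truncate'     local/chain/tuning/run_tdnn_lstm_1*.sh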
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh new file mode 100755 index 00000000000..eb20415e515 --- /dev/null +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/chain/multi_condition/run_tdnn.sh. +# It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + +stage=1 +mic=ihm +nj=30 +min_seg_len=1.55 # min length in seconds... we do this because chain training + # will discard segments shorter than 1.5 seconds. Must remain in sync with + # the same option given to prepare_lores_feats.sh. +train_set=train_cleaned # you might set this to e.g. train_cleaned. +gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + + +num_threads_ubm=32 +rvb_affix=_rvb +nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it + # becomes exp/$mic/nnet3_cleaned or whatever. +num_data_reps=1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=${nnet3_affix}$rvb_affix + +gmmdir=exp/${mic}/${gmm} + + +for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 1 ] && [ -f data/$mic/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/$mic/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${mic}/${train_set} data/${mic}/${train_set}_sp + + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/$mic/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev eval; do + utils/copy_data_dir.sh data/$mic/$datadir data/$mic/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/$mic/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev eval; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/$mic/${datadir}_hires + steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires + utils/fix_data_dir.sh data/$mic/${datadir}_hires + done +fi + +if [ $stage -le 2 ]; then + echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" + # we have to combine short segments or we won't be able to train chain models + # on those segments. + utils/data/combine_short_segments.sh \ + data/${mic}/${train_set}_sp_hires $min_seg_len data/${mic}/${train_set}_sp_hires_comb + + # just copy over the CMVN to avoid having to recompute it. + cp data/${mic}/${train_set}_sp_hires/cmvn.scp data/${mic}/${train_set}_sp_hires_comb/ + utils/fix_data_dir.sh data/${mic}/${train_set}_sp_hires_comb/ +fi + +if [ $stage -le 3 ]; then + echo "$0: creating reverberated MFCC features" + + datadir=data/ihm/train_cleaned_sp + + mfccdir=${datadir}_rvb${num_data_reps}_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + if [ ! -f ${datadir}_rvb${num_data_reps}_hires/feats.scp ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) + + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs "20:10:15:5:0" \ + --background-snrs "20:10:15:5:0" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications ${num_data_reps} \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + ${datadir} ${datadir}_rvb${num_data_reps} + + utils/copy_data_dir.sh ${datadir}_rvb${num_data_reps} ${datadir}_rvb${num_data_reps}_hires + utils/data/perturb_data_dir_volume.sh ${datadir}_rvb${num_data_reps}_hires + + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" ${datadir}_rvb${num_data_reps}_hires + steps/compute_cmvn_stats.sh ${datadir}_rvb${num_data_reps}_hires + utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires + + utils/data/combine_short_segments.sh \ + ${datadir}_rvb${num_data_reps}_hires $min_seg_len ${datadir}_rvb${num_data_reps}_hires_comb + + # just copy over the CMVN to avoid having to recompute it. 
+ cp ${datadir}_rvb${num_data_reps}_hires/cmvn.scp ${datadir}_rvb${num_data_reps}_hires_comb/ + utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires_comb/ + fi + + utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${datadir}_rvb${num_data_reps}_hires + utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires_comb data/${mic}/${train_set}_sp_hires_comb ${datadir}_rvb${num_data_reps}_hires_comb +fi + + +if [ $stage -le 4 ]; then + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/$mic/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${mic}/${train_set}/feats.scp \ + data/${mic}/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l /dev/null || true + if [ -z $extra_left_context ]; then + extra_left_context=$chunk_left_context + fi + if [ -z $extra_right_context ]; then + extra_right_context=$chunk_right_context + fi + if [ -z $frames_per_chunk ]; then + frames_per_chunk=$chunk_width + fi + model_opts= + [ ! -z $decode_iter ] && model_opts=" --iter $decode_iter "; + for decode_set in dev eval; do + ( + num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_${decode_set} + steps/nnet3/decode.sh --nj 250 --cmd "$decode_cmd" \ + $model_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/$mic/${decode_set}_hires $decode_dir || exit 1; + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: error detected during decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 79d633b1ebd..e54b5f43128 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 5fa4ea565cd..0ca6062e9c8 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ 
--trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/fisher_data_prep.sh b/egs/aspire/s5/local/fisher_data_prep.sh index 93abf390225..233185f071e 100755 --- a/egs/aspire/s5/local/fisher_data_prep.sh +++ b/egs/aspire/s5/local/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/fisher_english/s5/local/fisher_data_prep.sh b/egs/fisher_english/s5/local/fisher_data_prep.sh index 93abf390225..233185f071e 100755 --- a/egs/fisher_english/s5/local/fisher_data_prep.sh +++ b/egs/fisher_english/s5/local/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index b70da4e852a..d9b11f9fb21 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/fisher_swbd/s5/local/fisher_data_prep.sh b/egs/fisher_swbd/s5/local/fisher_data_prep.sh index dfc29c5a6c6..470577f28d3 100755 --- a/egs/fisher_swbd/s5/local/fisher_data_prep.sh +++ b/egs/fisher_swbd/s5/local/fisher_data_prep.sh @@ -39,7 +39,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data_fisher/links/$subdir else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data_fisher/links/$subdir diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index be6c82a935e..4afa867503a 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -108,9 +108,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a 
left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -136,8 +133,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh index ef7e9d2594f..81732779d37 100755 --- a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh @@ -115,9 +115,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -143,8 +140,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $train_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/lre07/v2/local/dnn/fisher_data_prep.sh b/egs/lre07/v2/local/dnn/fisher_data_prep.sh index c7e74dea3bc..70cede2f86c 100755 --- a/egs/lre07/v2/local/dnn/fisher_data_prep.sh +++ b/egs/lre07/v2/local/dnn/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 51ca7db0495..df9b8002d0c 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ 
--trainer.max-param-change 1.414 \ @@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/multi_en/s5/local/fisher_data_prep.sh b/egs/multi_en/s5/local/fisher_data_prep.sh index 386fb5e111c..ae0b9683125 100755 --- a/egs/multi_en/s5/local/fisher_data_prep.sh +++ b/egs/multi_en/s5/local/fisher_data_prep.sh @@ -46,7 +46,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/fisher/links/$subdir else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/fisher/links/$subdir diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index 65a9840df71..ecafb588cfe 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -230,8 +230,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 7.36 [ 923 / 12533, 85 ins, 148 del, 690 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch4/wer_13 ### chain results ### -# current best chain result with TDNN (check local/chain/run_tdnn_5f.sh) -%WER 2.94 [ 369 / 12533, 51 ins, 71 del, 247 sub ] exp/chain/tdnn_5f/decode/wer_3_0.5 +# current best chain result with TDNN (check local/chain/run_tdnn_5g.sh) +%WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0 +%WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0 ### nnet1 results ### diff --git a/egs/rm/s5/local/chain/run_tdnn_5g.sh b/egs/rm/s5/local/chain/run_tdnn_5g.sh new file mode 100755 index 00000000000..f6fbe070763 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_5g.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# This is modified from run_tdnn_5f.sh, to use the old topology, as a baseline +# to test the modified transition-model code (by which we hope to be able to +# create more compact decoding graphs for chain models). + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_5g + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/train \ + --ivector-dir exp/nnet2_online/ivectors \ + --tree-dir $treedir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2_online/ivectors \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs true \ + --feat-dir data/train \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir +fi + +if [ $stage -le 9 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 10 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test $dir/decode || exit 1; +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5n.sh old mode 100644 new mode 100755 similarity index 62% rename from egs/rm/s5/local/chain/run_tdnn_5f.sh rename to egs/rm/s5/local/chain/run_tdnn_5n.sh index 0379d16fe13..7fd7b82aa1d --- a/egs/rm/s5/local/chain/run_tdnn_5f.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5n.sh @@ -1,6 +1,9 @@ #!/bin/bash -# this script is a modified version of swbd/run_tdnn_5f.sh +# this script is a modified version of run_tdnn_5g.sh. It uses +# the new transition model and the python version of training scripts. 
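As a quick cross-check of the two new RM chain numbers added to egs/rm/s5/RESULTS above, the usual pattern from that file should work (this assumes both systems were trained into their default directories, exp/chain/tdnn_5g and exp/chain/tdnn_5n):

for x in exp/chain/tdnn_5g exp/chain/tdnn_5n; do
  grep WER $x/decode/wer_* | utils/best_wer.sh
done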
+ + set -e @@ -8,7 +11,7 @@ set -e stage=0 train_stage=-10 get_egs_stage=-10 -dir=exp/chain/tdnn_5f +dir=exp/chain/tdnn_5n # training options num_epochs=12 @@ -43,13 +46,13 @@ fi # run those things. ali_dir=exp/tri3b_ali -treedir=exp/chain/tri4_2y_tree -lang=data/lang_chain_2y +treedir=exp/chain/tri4_5n_tree +lang=data/lang_chain_5n local/online/run_nnet2_common.sh --stage $stage || exit 1; if [ $stage -le 4 ]; then - # Get the alignments as lattices (gives the CTC training more freedom). + # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ @@ -78,51 +81,73 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/train \ + --ivector-dir exp/nnet2_online/ivectors \ + --tree-dir $treedir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ --xent-regularize 0.1 \ - --leaky-hmm-coefficient 0.1 \ - --l2-regularize 0.00005 \ - --jesus-opts "--jesus-forward-input-dim 200 --jesus-forward-output-dim 500 --jesus-hidden-dim 2000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ - --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0" \ - --apply-deriv-weights false \ - --frames-per-iter 1000000 \ - --lm-opts "--num-extra-lm-states=200" \ - --get-egs-stage $get_egs_stage \ - --minibatch-size $minibatch_size \ - --egs-opts "--frames-overlap-per-eg 0" \ - --frames-per-eg $frames_per_eg \ - --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ - --feat-type raw \ - --online-ivector-dir exp/nnet2_online/ivectors \ - --cmvn-opts "--norm-means=false --norm-vars=false" \ - --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ - --max-param-change $max_param_change \ - --cmd "$decode_cmd" \ - --remove-egs $remove_egs \ - data/train $treedir exp/tri3b_lats $dir || exit 1; + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; fi if [ $stage -le 8 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2_online/ivectors \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs true \ + --feat-dir data/train \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir +fi + +if [ $stage -le 9 ]; then steps/online/nnet2/extract_ivectors_online.sh 
--cmd "$train_cmd" --nj 4 \ data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context 20 --scoring-opts "--min-lmwt 1" \ + --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ $dir/graph data/test $dir/decode || exit 1; fi -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context 20 \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ $dir/graph_ug data/test $dir/decode_ug || exit 1; diff --git a/egs/sprakbanken_swe/s5/RESULTS b/egs/sprakbanken_swe/s5/RESULTS new file mode 100644 index 00000000000..a133a0c2e9f --- /dev/null +++ b/egs/sprakbanken_swe/s5/RESULTS @@ -0,0 +1,19 @@ +%WER 48.86 [ 34040 / 69674, 3407 ins, 7500 del, 23133 sub ] exp/mono/decode_test120_p_spk/wer_9_0.0 + +%WER 24.16 [ 16835 / 69674, 2620 ins, 2887 del, 11328 sub ] exp/tri1/decode_test120_p_spk/wer_13_0.5 + +%WER 23.86 [ 16623 / 69674, 2793 ins, 2576 del, 11254 sub ] exp/tri2a/decode_test120_p_spk/wer_13_0.5 + +%WER 22.66 [ 15791 / 69674, 3016 ins, 2196 del, 10579 sub ] exp/tri2b/decode_test120_p_spk/wer_16_0.0 + +%WER 20.19 [ 14065 / 69674, 2899 ins, 1929 del, 9237 sub ] exp/tri3b/decode_test120_p_spk/wer_17_0.0 + +%WER 19.06 [ 13279 / 69674, 2900 ins, 1673 del, 8706 sub ] exp/tri4a/decode_test120_p_spk/wer_18_0.0 + +—————————————————————————————————————————————————————————————————————————————————— +#full test set + +%WER 18.88 [ 111453 / 590285, 25457 ins, 13698 del, 72298 sub ] exp/tri4a/decode_4g_test/wer_17_0.0 + +%WER 15.97 [ 94242 / 590285, 21022 ins, 12697 del, 60523 sub ] exp/nnet5c/decode_4g_test/wer_11 + diff --git a/egs/sre10/v1/local/dnn/fisher_data_prep.sh b/egs/sre10/v1/local/dnn/fisher_data_prep.sh index c7e74dea3bc..70cede2f86c 100755 --- a/egs/sre10/v1/local/dnn/fisher_data_prep.sh +++ b/egs/sre10/v1/local/dnn/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh deleted file mode 100755 index ded03563711..00000000000 --- a/egs/swbd/s5c/local/chain/compare_wer.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - - -echo -n "System " -for x in $*; do printf "% 10s" $x; done -echo - -echo -n "WER on train_dev(tg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on train_dev(fg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on 
eval2000(tg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on eval2000(fg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "Final train prob " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final train prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..c8aae0b3b94 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh new file mode 100755 index 00000000000..542dae82581 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +models="" +for x in $*; do models="$models tdnn_${x}"; done + +local/chain/compare_wer_general.sh $models diff --git a/egs/swbd/s5c/local/chain/run_blstm.sh 
b/egs/swbd/s5c/local/chain/run_blstm.sh new file mode 120000 index 00000000000..0160247619f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm.sh @@ -0,0 +1 @@ +tuning/run_blstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_lstm.sh b/egs/swbd/s5c/local/chain/run_lstm.sh index 28e5e6cc20c..8b421ac2649 120000 --- a/egs/swbd/s5c/local/chain/run_lstm.sh +++ b/egs/swbd/s5c/local/chain/run_lstm.sh @@ -1 +1 @@ -tuning/run_lstm_6i.sh \ No newline at end of file +tuning/run_lstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 669740d5f27..7b86453e14b 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7f.sh \ No newline at end of file +tuning/run_tdnn_7h.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh b/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..a4fa11e0908 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index 95f7aef2708..a1be44cdbbf 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index 26cdaed29d7..a4333e40b30 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh new file mode 100755 index 00000000000..34dd378a7fe --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev=0 +# and bias-stddev=0 initialization.
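As an aside, the blstm_6i / blstm_6j comparison a few lines below looks like it can be regenerated with the compare_wer_general.sh helper added earlier in this patch, assuming both runs used speed perturbation (hence the _sp directory suffix) and the default decode-directory names:

local/chain/compare_wer_general.sh blstm_6i blstm_6j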
+# This run also accounts for changes in training due to the BackpropTruncationComponent + +#System blstm_6i blstm_6j +#WER on train_dev(tg) 14.11 13.80 +#WER on train_dev(fg) 13.04 12.64 +#WER on eval2000(tg) 16.2 15.6 +#WER on eval2000(fg) 14.6 14.2 +#Final train prob -0.0615713-0.0552637 +#Final valid prob -0.0829338-0.0765151 +#Final train prob (xent) -1.16518 -0.777318 +#Final valid prob (xent) -1.26028 -0.912595 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index fbced146199..ac22e858aea 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index c5548cbfa5c..db0a0fe7b1a 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh new file mode 100755 index 00000000000..90afd1fb4cd --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev=0 +# and bias-stddev=0 initialization. The results also account for changes +# due to BackpropTruncationComponent in place of ClipGradientComponent. +# Note that removal of layerwise discriminative pretraining does not result +# in a lot of improvement in LSTMs, compared to TDNNs (7f vs 7g).
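A small worked example for the xent learning-rate factor that the config-generation stage of this script computes: run_lstm_6j.sh sets xent_regularize=0.025, so the python one-liner it uses evaluates to 0.5 / 0.025 = 20 (the variable name is copied from the script; the standalone echo below is only illustrative):

xent_regularize=0.025
echo "print 0.5/$xent_regularize" | python   # prints 20.0, used as learning-rate-factor on the xent output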
+ +#System lstm_6i_ld5 lstm_6j_ld5 +#WER on train_dev(tg) 14.65 14.66 +#WER on train_dev(fg) 13.38 13.42 +#WER on eval2000(tg) 16.9 16.8 +#WER on eval2000(fg) 15.4 15.4 +#Final train prob -0.0751668-0.0824531 +#Final valid prob -0.0928206-0.0989325 +#Final train prob (xent) -1.34549 -1.15506 +#Final valid prob (xent) -1.41301 -1.24364 +# +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index 28c20c92ab0..aa666e4c5ab 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then --chain.xent-regularize $xent_regularize \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 2.0 \ --trainer.num-epochs 4 \ @@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts="--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh new file mode 100755 index 00000000000..7a4512097d3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -0,0 +1,228 @@ +#!/bin/bash + + +# 7g is same as 7f but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. + + +# System 7f 7g +# WER on train_dev(tg) 14.46 13.85 +# WER on train_dev(fg) 13.23 12.67 +# WER on eval2000(tg) 17.0 16.5 +# WER on eval2000(fg) 15.4 14.8 +# Final train prob -0.0882071 -0.0885075 +# Final valid prob -0.107545 -0.113462 +# Final train prob (xent) -1.26246 -1.25788 +# Final valid prob (xent) -1.35525 -1.37058 + + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +#common_egs_dir=exp/chain/tdnn_7e_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + # it doesn't make sense to have -6,0,6 splicing for a chain model + # as we compute a sequence of outputs and computation can be shared + # this has to be split into two -3,0,3 layers. But I will keep this + # to have same setup as 7f + relu-renorm-layer name=tdnn6 input=Append(-6,0,6) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh new file mode 100755 index 00000000000..00743ca9ebf --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +#System tdnn_7g tdnn_7h +#WER on train_dev(tg) 13.98 13.84 +#WER on train_dev(fg) 12.78 12.84 +#WER on eval2000(tg) 16.7 16.5 +#WER on eval2000(fg) 14.9 14.8 +#Final train prob -0.0817467-0.0889771 +#Final valid prob -0.110475 -0.113102 +#Final train prob (xent) -1.20065 -1.2533 +#Final valid prob (xent) -1.3313 -1.36743 +# +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
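Everything above the "End configuration section" marker can be overridden from the command line, since the script passes "$@" through utils/parse_options.sh just below; a purely illustrative invocation (the option values here are hypothetical) would be:

local/chain/tuning/run_tdnn_7h.sh --stage 12 --train-stage -10 --num-epochs 6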
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh new file mode 100755 index 00000000000..1b3e86715ed --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +# Same as 7h but double the number of parameters (27983950 vs 15551509) + +set -e + + +#System tdnn_7h tdnn_7i +#WER on train_dev(tg) 13.84 13.48 +#WER on train_dev(fg) 12.84 12.47 +#WER on eval2000(tg) 16.5 16.4 +#WER on eval2000(fg) 14.8 14.9 +#Final train prob -0.0889771-0.0785415 +#Final valid prob -0.113102 -0.105757 +#Final train prob (xent) -1.2533 -1.15785 +#Final valid prob (xent) -1.36743 -1.28397 +# +# configs for 'chain' +affix= +stage=12 +train_stage=0 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7i # Note: _sp will get added to this if $speed_perturb == true. 
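+# (Editorial illustration, not part of the original script: the parameter
+#  increase over tdnn_7h quoted above comes from widening the hidden TDNN
+#  layers in the xconfig below from dim=625 to dim=1024, while the prefinal
+#  layers stay at dim=625.)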
+decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_7g_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh new file mode 100755 index 00000000000..b19ea6eafab --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -0,0 +1,221 @@ +#!/bin/bash + + +#System tdnn_7h tdnn_7j +#WER on train_dev(tg) 13.84 14.15 +#WER on train_dev(fg) 12.84 12.96 +#WER on eval2000(tg) 16.5 16.8 +#WER on eval2000(fg) 14.8 15.1 +#Final train prob -0.0889771-0.0910883 +#Final valid prob -0.113102 -0.112464 +#Final train prob (xent) -1.2533 -1.31768 +#Final valid prob (xent) -1.36743 -1.41603 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7j # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_7g_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + tdnn-relu-renorm-layer name=tdnn2 splice-indexes=-1,0,1 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn3 splice-indexes=-1,0,1 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn4 splice-indexes=-3,0,3 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn5 splice-indexes=-3,0,3 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn6 splice-indexes=-3,0,3 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn7 splice-indexes=-3,0,3 dim=768 subset-dim=384 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=768 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=768 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + nnet3-info $dir/configs/ref.raw |grep num-param +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh new file mode 100644 index 00000000000..06ae6f49728 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# 7l is based on 7h, but adding a 64 dim lowrank module in the xent branch +#System tdnn_7h tdnn_7l +#WER on train_dev(tg) 13.84 13.83 +#WER on train_dev(fg) 12.84 12.88 +#WER on eval2000(tg) 16.5 16.4 +#WER on eval2000(fg) 14.8 14.7 +#Final train prob -0.089 -0.090 +#Final valid prob -0.113 -0.116 +#Final train prob (xent) -1.25 -1.38 +#Final valid prob (xent) -1.36 -1.48 +#Time consuming one iter 53.56s 48.18s +#Time reduction percent 10.1% +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7l # Note: _sp will get added to this if $speed_perturb == true. 
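+# (Editorial illustration, not part of the original script: the 64-dim
+#  low-rank module is the extra 'prefinal-lowrank-xent' layer in the xconfig
+#  below. It shrinks the final xent affine from roughly 625*num_targets
+#  parameters to about 625*64 + 64*num_targets, which is presumably where the
+#  per-iteration time saving quoted above comes from; the chain branch is
+#  unchanged relative to 7h.)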
+decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625 + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + relu-renorm-layer name=prefinal-lowrank-xent input=prefinal-xent dim=64 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..e32fdffb69d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# 1a is a straight forward combination of tdnn_7h and lstm_6j. +# TDNN layers are stacked before LSTM. +# This model has the same performance as the BLSTM. + + +#System lstm_6j tdnn_7h blstm_6j tdnn_lstm_1a +#WER on train_dev(tg) 14.66 13.84 13.80 13.42 +#WER on train_dev(fg) 13.42 12.84 12.64 12.42 +#WER on eval2000(tg) 16.8 16.5 15.6 15.7 +#WER on eval2000(fg) 15.4 14.8 14.2 14.2 +#Final train prob -0.0824531-0.0889771-0.0552637-0.0538088 +#Final valid prob -0.0989325 -0.113102-0.0765151-0.0800484 +#Final train prob (xent) -1.15506 -1.2533 -0.777318 -0.7603 +#Final valid prob (xent) -1.24364 -1.36743 -0.912595 -0.949909 + + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true. 
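+# (Editorial note, an interpretation not stated in the original script: because
+#  the LSTM layers are recurrent, the options below give each training chunk 40
+#  frames of extra left context (chunk_left_context) to warm up the recurrent
+#  state, and 50 frames at decode time (extra_left_context); label_delay=5
+#  delays the output so the unidirectional LSTMs effectively see a few frames
+#  of future input.)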
+decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..555afa467fa --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,237 @@ +#!/bin/bash + +# Unlike 1a this setup interleaves the TDNN and LSTM layers. 
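+# (Editorial illustration, not part of the original script: "interleaves" here
+#  means the layer order in the xconfig below is tdnn1-3, lstm1, tdnn4-5,
+#  lstm2, tdnn6-7, lstm3, instead of stacking all seven TDNN layers before the
+#  three LSTM layers as in tdnn_lstm_1a.)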
+ +#System tdnn_lstm_1a_ld5tdnn_lstm_1b_ld5 +#WER on train_dev(tg) 13.42 13.00 +#WER on train_dev(fg) 12.42 12.03 +#WER on eval2000(tg) 15.7 15.3 +#WER on eval2000(fg) 14.2 13.9 +#Final train prob -0.0538088 -0.056294 +#Final valid prob -0.0800484-0.0813322 +#Final train prob (xent) -0.7603 -0.777787 +#Final valid prob (xent) -0.949909 -0.939146 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 1908b390151..99f6a31e708 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -8,8 +8,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# -. cmd.sh +# stage=0 @@ -26,7 +25,7 @@ extra_right_context=40 extra_left_context_initial=-1 extra_right_context_final=-1 -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -52,9 +51,9 @@ effective_learning_rate=0.0000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false +adjust_priors=true # May need to be set to false # because it does not help in some setups modify_learning_rates=true last_layer_factor=0.1 @@ -64,8 +63,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/prepare_dict.sh b/egs/tedlium/s5_r2/local/prepare_dict.sh index 9ba31893b22..18837c21085 100755 --- a/egs/tedlium/s5_r2/local/prepare_dict.sh +++ b/egs/tedlium/s5_r2/local/prepare_dict.sh @@ -3,13 +3,14 @@ # Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) # 2016 Daniel Galvez +# 2016 Vincent Nguyen # Apache 2.0 # dir=data/local/dict_nosp mkdir -p $dir -srcdict=db/cantab-TEDLIUM/cantab-TEDLIUM.dct +srcdict=db/TEDLIUM_release2/TEDLIUM.152k.dic [ ! 
-r $srcdict ] && echo "Missing $srcdict" && exit 1
diff --git a/egs/tedlium/s5_r2/local/run_learn_lex.sh b/egs/tedlium/s5_r2/local/run_learn_lex.sh
new file mode 100755
index 00000000000..4960fbd848e
--- /dev/null
+++ b/egs/tedlium/s5_r2/local/run_learn_lex.sh
@@ -0,0 +1,136 @@
+#! /bin/bash
+#
+# This script demonstrates a lexicon learning recipe, which aims to improve
+# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
+# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh
+# for an explanation of the options.
+#
+# Copyright 2016 Xiaohui Zhang
+# Apache 2.0
+
+. ./cmd.sh
+. ./path.sh
+
+oov_symbol=""
+# The user may have an English g2p model ready.
+g2p_mdl_dir=
+# The dir which contains the reference lexicon (most probably hand-derived)
+# we want to expand/improve, and nonsilence_phones.txt, etc., which we need
+# for building new dict dirs.
+ref_dict=data/local/dict
+# Acoustic training data we use to get alternative
+# pronunciations and collect acoustic evidence.
+data=data/train
+# The cut-off parameter used to select pronunciation candidates from phone
+# decoding. We remove pronunciations with probabilities less than this value,
+# after normalizing the probs s.t. the max-prob is 1.0 for each word.
+min_prob=0.4
+# Mean of priors (summing up to 1) assigned to the three exclusive pronunciation
+# sources: reference lexicon, g2p, and phone decoding (used in the Bayesian
+# pronunciation selection procedure). We recommend setting a larger prior
+# mean for the reference lexicon, e.g. '0.6,0.2,0.2'.
+prior_mean="0.7,0.2,0.1"
+# Total amount of prior counts we add to all pronunciation candidates of
+# each word. By multiplying it with the prior mean of a source, and then dividing
+# by the number of candidates (for a word) from this source, we get the
+# prior counts we actually add to each candidate.
+prior_counts_tot=15
+# In the Bayesian pronunciation selection procedure, for each word, we
+# choose candidates (from all three sources) with the highest posteriors
+# until the total prob mass hits this amount.
+# It's used in a similar fashion when we apply G2P.
+variants_prob_mass=0.6
+# In the Bayesian pronunciation selection procedure, for each word,
+# after the total prob mass of selected candidates hits variants-prob-mass,
+# we continue to pick up reference candidates with the highest posteriors
+# until the total prob mass hits this amount (must be >= variants_prob_mass).
+variants_prob_mass_ref=0.95
+# Intermediate outputs of the lexicon learning stage will be put into dir
+dir=exp/tri3_lex_work
+nj=35
+decode_nj=30
+stage=0
+lexlearn_stage=0
+
+. utils/parse_options.sh # accept options
+
+
+# The reference vocab is the list of words for which we already have hand-derived pronunciations.
+ref_vocab=data/local/vocab.txt
+cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;
+
+# Get a G2P generated lexicon for OOV words (w.r.t. the reference lexicon)
+# in the acoustic training data.
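+# (Editorial illustration, not part of the original recipe: with the defaults
+#  above, prior_counts_tot=15 and prior_mean="0.7,0.2,0.1", the reference
+#  lexicon source gets 15 * 0.7 = 10.5 prior counts in total for each word; if
+#  a word has 3 reference candidates, each of them receives 10.5 / 3 = 3.5
+#  prior counts, and likewise 15 * 0.2 and 15 * 0.1 are shared among the G2P
+#  and phone-decoding candidates respectively.)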
+if [ $stage -le 0 ]; then
+  if [ -z $g2p_mdl_dir ]; then
+    g2p_mdl_dir=exp/g2p
+    steps/dict/train_g2p.sh --cmd "$decode_cmd --mem 4G" $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
+  fi
+  awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
+    $data/text | sort -u > $data/train_vocab.txt || exit 1;
+  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
+    $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
+  steps/dict/apply_g2p.sh --var-counts 4 $data/oov_train.txt \
+    $g2p_mdl_dir exp/g2p/oov_lex_train || exit 1;
+  cat exp/g2p/oov_lex_train/lexicon.lex | cut -f1,3 | \
+    tr -s '\t' ' ' | sort | uniq > $data/lexicon_oov_g2p.txt || exit 1;
+fi
+
+# Learn a lexicon based on the acoustic training data and the reference lexicon.
+if [ $stage -le 1 ]; then
+  steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
+    --min-prob $min_prob --variants-prob-mass $variants_prob_mass \
+    --variants-prob-mass-ref $variants_prob_mass_ref \
+    --prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \
+    --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl true \
+    $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_learned_nosp \
+    $dir || exit 1;
+fi
+
+# Add pronunciation probs to the learned lexicon.
+if [ $stage -le 1 ]; then
+  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
+    data/local/dict_learned_nosp $oov_symbol data/local/lang_learned_nosp data/lang_learned_nosp || exit 1;
+
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    $data data/lang_learned_nosp exp/tri2 exp/tri2_ali_learned_lex_nosp || exit 1;
+
+  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_learned_nosp exp/tri2_ali_learned_lex_nosp || exit 1;
+
+  utils/dict_dir_add_pronprobs.sh --max-normalize true \
+    data/local/dict_learned_nosp exp/tri2_ali_learned_lex_nosp/pron_counts_nowb.txt \
+    exp/tri2_ali_learned_lex_nosp/sil_counts_nowb.txt \
+    exp/tri2_ali_learned_lex_nosp/pron_bigram_counts_nowb.txt data/local/dict_learned || exit 1;
+
+  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
+    data/local/dict_learned $oov_symbol data/local/lang_learned data/lang_learned || exit 1;
+fi
+
+# Re-train the acoustic model using the learned lexicon
+if [ $stage -le 2 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    $data data/lang_learned exp/tri3 exp/tri3_ali_learned_lex || exit 1;
+
+  steps/train_sat.sh --cmd "$train_cmd" \
+    5000 100000 $data data/lang_learned exp/tri3_ali_learned_lex exp/tri3_learned_lex || exit 1;
+fi
+
+# Decode
+if [ $stage -le 3 ]; then
+  cp -rT data/lang_learned data/lang_learned_rescore || exit 1;
+  ! cmp data/lang_nosp/words.txt data/lang_learned/words.txt &&\
+    echo "$0: The vocab of the learned lexicon and the reference vocab may be incompatible."
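+  # (Editorial note, an assumption not stated in the original script:
+  #  prepare_lang.sh does not create G.fst or G.carpa, so the grammar from the
+  #  original lang_nosp directories is reused below; this is only valid when
+  #  the word list of the learned lexicon matches, hence the check above.)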
+ cp data/lang_nosp/G.fst data/lang_learned/ + cp data/lang_nosp_rescore/G.carpa data/lang_learned_rescore/ + utils/mkgraph.sh data/lang_learned exp/tri3_learned_lex exp/tri3_learned_lex/graph || exit 1; + + for dset in dev test; do + ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3_learned_lex/graph data/${dset} exp/tri3_learned_lex/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_learned data/lang_learned_rescore \ + data/${dset} exp/tri3_learned_lex/decode_${dset} exp/tri3_learned_lex/decode_${dset}_rescore || exit 1; + ) & + done +fi + +wait diff --git a/egs/tedlium/s5_r2/local/ted_train_lm.sh b/egs/tedlium/s5_r2/local/ted_train_lm.sh index 3d46726b5ca..3a1bef567fb 100755 --- a/egs/tedlium/s5_r2/local/ted_train_lm.sh +++ b/egs/tedlium/s5_r2/local/ted_train_lm.sh @@ -44,7 +44,7 @@ num_dev_sentences=10000 # These example numbers of metaparameters is for 4-gram model (with min-counts) # running with train_lm.py. # The dev perplexity should be close to the non-bypassed model. -bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.837,0.023,0.761,0.065,0.029,0.015,0.999,0.361,0.157,0.080,0.999,0.625,0.2164,0.2162" +bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406" # Note: to use these example parameters, you may need to remove the .done files # to make sure the make_lm_dir.py be called and tain only 3-gram model #for order in 3; do @@ -58,8 +58,8 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true - # cantab-TEDLIUM is the larger data source. gzip it. - sed 's/ <\/s>//g' < db/cantab-TEDLIUM/cantab-TEDLIUM.txt | gzip -c > ${dir}/data/text/train.txt.gz + # Unzip TEDLIUM 6 data sources, normalize apostrophe+suffix to previous word, gzip the result. + gunzip -c db/TEDLIUM_release2/LM/*.en.gz | sed 's/ <\/s>//g' | local/join_suffix.py | gzip -c > ${dir}/data/text/train.txt.gz # use a subset of the annotated training data as the dev set . # Note: the name 'dev' is treated specially by pocolm, it automatically # becomes the dev set. @@ -76,7 +76,7 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt # get wordlist - awk '{print $1}' db/cantab-TEDLIUM/cantab-TEDLIUM.dct | sort | uniq > ${dir}/data/wordlist + awk '{print $1}' db/TEDLIUM_release2/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort | uniq > ${dir}/data/wordlist fi order=4 @@ -103,13 +103,7 @@ if [ $stage -le 1 ]; then ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - - # current results, after adding --limit-unk-history=true: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/wordlist_4.pocolm was -5.13486225358 per word [perplexity = 169.840923284] over 18290.0 words. - # older results (after adding min-counts): - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/wordlist_4.pocolm was -5.13902242865 per word [perplexity = 170.514153159] over 18290.0 words. - # even older results, before adding min-counts: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4 was -5.10576291033 per word [perplexity = 164.969879761] over 18290.0 words. 
+ #[perplexity = 157.87] over 18290.0 words fi if [ $stage -le 2 ]; then @@ -121,10 +115,8 @@ if [ $stage -le 2 ]; then get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' # current results, after adding --limit-unk-history=true: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.17558740241 per word [perplexity = 176.90049554] over 18290.0 words. - # older results, after adding min-counts: - # get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.17638942756 per word [perplexity = 177.006688203] over 18290.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz @@ -140,11 +132,8 @@ if [ $stage -le 3 ]; then get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.28036622198 per word [perplexity = 196.441803486] over 18290.0 words. - # older results, after adding min-counts: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.28346290049 per word [perplexity = 197.123843355] over 18290.0 words. - # even older results, before adding min-counts: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.27623197813 per word [perplexity = 195.631341646] over 18290.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz fi diff --git a/egs/tedlium/s5_r2/run.sh b/egs/tedlium/s5_r2/run.sh index 754cec0494d..19bc92a738c 100755 --- a/egs/tedlium/s5_r2/run.sh +++ b/egs/tedlium/s5_r2/run.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) - local/chain/run_tdnn.sh + local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" fi # The nnet3 TDNN recipe: diff --git a/egs/voxforge/s5/local/voxforge_prepare_dict.sh b/egs/voxforge/s5/local/voxforge_prepare_dict.sh index d27e8be69f9..9936ba7b556 100755 --- a/egs/voxforge/s5/local/voxforge_prepare_dict.sh +++ b/egs/voxforge/s5/local/voxforge_prepare_dict.sh @@ -12,7 +12,7 @@ echo "=== Preparing the dictionary ..." if [ ! -f $locdict/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." 
- mkdir -p $locdict + mkdir -p $locdict svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ $locdict/cmudict || exit 1; fi @@ -64,6 +64,7 @@ g2p.py --model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon- cat $locdict/lexicon-oov.txt $locdict/lexicon-iv.txt |\ sort > $locdict/lexicon.txt +rm $locdict/lexiconp.txt 2>/dev/null || true echo "--- Prepare phone lists ..." echo SIL > $locdict/silence_phones.txt diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index ad1c12a835a..0b6d7bb3970 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -123,9 +123,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -151,8 +148,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh index b7ace847c6a..a514e354eef 100755 --- a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh @@ -107,9 +107,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -135,8 +132,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/wsj/s5/local/run_segmentation.sh b/egs/wsj/s5/local/run_segmentation.sh index 553260c0f0c..458536162cb 100755 --- a/egs/wsj/s5/local/run_segmentation.sh +++ b/egs/wsj/s5/local/run_segmentation.sh @@ -8,52 +8,75 @@ # bigram language model built from the reference, and then work out the # segmentation from a ctm like file. +stage=0 + . ./cmd.sh . 
./path.sh -local/append_utterances.sh data/train_si284 data/train_si284_long -steps/cleanup/split_long_utterance.sh \ - --seg-length 30 --overlap-length 5 \ - data/train_si284_long data/train_si284_split +if [ $stage -le 0 ]; then + local/append_utterances.sh data/train_si284 data/train_si284_long + steps/cleanup/split_long_utterance.sh \ + --seg-length 30 --overlap-length 5 \ + data/train_si284_long data/train_si284_split +fi -steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ - data/train_si284_split exp/make_mfcc/train_si284_split mfcc || exit 1; -steps/compute_cmvn_stats.sh data/train_si284_split \ - exp/make_mfcc/train_si284_split mfcc || exit 1; +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ + data/train_si284_split exp/make_mfcc/train_si284_split mfcc || exit 1; + steps/compute_cmvn_stats.sh data/train_si284_split \ + exp/make_mfcc/train_si284_split mfcc || exit 1; +fi -steps/cleanup/make_segmentation_graph.sh \ - --cmd "$mkgraph_cmd" --nj 32 \ - data/train_si284_split/ data/lang exp/tri2b/ \ - exp/tri2b/graph_train_si284_split || exit 1; +if [ $stage -le 2 ]; then + steps/cleanup/make_segmentation_graph.sh \ + --cmd "$mkgraph_cmd" --nj 32 \ + data/train_si284_split/ data/lang exp/tri2b/ \ + exp/tri2b/graph_train_si284_split || exit 1; +fi -steps/cleanup/decode_segmentation.sh \ - --nj 64 --cmd "$decode_cmd" --skip-scoring true \ - exp/tri2b/graph_train_si284_split/lats \ - data/train_si284_split exp/tri2b/decode_train_si284_split || exit 1; +if [ $stage -le 3 ]; then + steps/cleanup/decode_segmentation.sh \ + --nj 64 --cmd "$decode_cmd" --skip-scoring true \ + exp/tri2b/graph_train_si284_split \ + data/train_si284_split exp/tri2b/decode_train_si284_split || exit 1; +fi -steps/get_ctm.sh --cmd "$decode_cmd" data/train_si284_split \ - exp/tri2b/graph_train_si284_split exp/tri2b/decode_train_si284_split +if [ $stage -le 4 ]; then + steps/get_ctm.sh --cmd "$decode_cmd" data/train_si284_split \ + exp/tri2b/graph_train_si284_split exp/tri2b/decode_train_si284_split +fi -steps/cleanup/make_segmentation_data_dir.sh --wer-cutoff 0.9 \ - --min-sil-length 0.5 --max-seg-length 15 --min-seg-length 1 \ - exp/tri2b/decode_train_si284_split/score_10/train_si284_split.ctm \ - data/train_si284_split data/train_si284_reseg +if [ $stage -le 5 ]; then + steps/cleanup/make_segmentation_data_dir.sh --wer-cutoff 0.9 \ + --min-sil-length 0.5 --max-seg-length 15 --min-seg-length 1 \ + exp/tri2b/decode_train_si284_split/score_10/train_si284_split.ctm \ + data/train_si284_split data/train_si284_reseg +fi # Now, use the re-segmented data for training. 
-steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ - data/train_si284_reseg exp/make_mfcc/train_si284_reseg mfcc || exit 1; -steps/compute_cmvn_stats.sh data/train_si284_reseg \ - exp/make_mfcc/train_si284_reseg mfcc || exit 1; - -steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ - data/train_si284_reseg data/lang exp/tri3b exp/tri3b_ali_si284_reseg || exit 1; - -steps/train_sat.sh --cmd "$train_cmd" \ - 4200 40000 data/train_si284_reseg \ - data/lang exp/tri3b_ali_si284_reseg exp/tri4c || exit 1; - -utils/mkgraph.sh data/lang_test_tgpr exp/tri4c exp/tri4c/graph_tgpr || exit 1; -steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4c/graph_tgpr data/test_dev93 exp/tri4c/decode_tgpr_dev93 || exit 1; -steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4c/graph_tgpr data/test_eval92 exp/tri4c/decode_tgpr_eval92 || exit 1; +if [ $stage -le 6 ]; then + steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ + data/train_si284_reseg exp/make_mfcc/train_si284_reseg mfcc || exit 1; + steps/compute_cmvn_stats.sh data/train_si284_reseg \ + exp/make_mfcc/train_si284_reseg mfcc || exit 1; +fi + +if [ $stage -le 7 ]; then + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/train_si284_reseg data/lang exp/tri3b exp/tri3b_ali_si284_reseg || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 4200 40000 data/train_si284_reseg \ + data/lang exp/tri3b_ali_si284_reseg exp/tri4c || exit 1; +fi + + +if [ $stage -le 9 ]; then + utils/mkgraph.sh data/lang_test_tgpr exp/tri4c exp/tri4c/graph_tgpr || exit 1; + steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri4c/graph_tgpr data/test_dev93 exp/tri4c/decode_tgpr_dev93 || exit 1; + steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri4c/graph_tgpr data/test_eval92 exp/tri4c/decode_tgpr_eval92 || exit 1; +fi diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index d1819bb51e2..2660ebce479 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -228,7 +228,7 @@ sub SplitLongSegment { $aligned_ctm->[$seg_end_index]->[2] - $aligned_ctm->[$seg_start_index]->[1]; my $current_seg_index = $seg_start_index; - my $aligned_ctm_size = scalar(@{$aligned_ctm}); + my $aligned_ctm_size = scalar(@{$aligned_ctm}); while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size-1) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, $max_seg_length); @@ -318,7 +318,7 @@ sub ProcessWav { $aligned_ctm[-1]->[3] += 1; } else { push(@aligned_ctm, ["", $start, $dur, 1]); - } + } } else { # Case 2.3: substitution. 
     push(@aligned_ctm, [$ref_word, $start, $dur, 1]);
@@ -417,11 +417,21 @@ sub InsertSilence {
     my @col = split;
     @col >= 2 || die "Error: bad line $_\n";
     my $wav = shift @col;
-    my @pairs = split(" $separator ", join(" ", @col));
-    for (my $x = 0; $x < @pairs; $x += 1) {
-      my @col1 = split(" ", $pairs[$x]);
-      @col1 == 2 || die "Error: bad pair $pairs[$x]\n";
-      $pairs[$x] = \@col1;
+    if ( (@col + 0) % 3 != 2) {
+      die "Bad line in align-text output (unexpected number of fields): $_";
+    }
+    my @pairs = ();
+
+    for (my $x = 0; $x * 3 + 2 <= @col; $x++) {
+      my $first_word = $col[$x * 3];
+      my $second_word = $col[$x * 3 + 1];
+      if ($x * 3 + 2 < @col) {
+        if ($col[$x*3 + 2] ne $separator) {
+          die "Bad line in align-text output (expected separator '$separator'): $_";
+        }
+      }
+      # the [ ] expression returns a reference to a new anonymous array.
+      push(@pairs, [ $first_word, $second_word ]);
     }
     ! defined($aligned{$wav}) || die "Error: $wav has already been processed\n";
     $aligned{$wav} = \@pairs;
diff --git a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh
index b7eaeb1319f..9091764924a 100755
--- a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh
+++ b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh
@@ -117,23 +117,34 @@ if [ $stage -le 8 ]; then

   export LC_ALL=C

-  cat $dir/word.ctm | awk '{printf("%s-%s %09d START %s\n", $1, $2, 100*$3, $5); printf("%s-%s %09d END %s\n", $1, $2, 100*($3+$4), $5);}' | \
-    sort >$dir/word_processed.ctm
+  cat $dir/word.ctm | awk '{printf("%s-%s %010.0f START %s\n", $1, $2, 1000*$3, $5); printf("%s-%s %010.0f END %s\n", $1, $2, 1000*($3+$4), $5);}' | \
+    sort > $dir/word_processed.ctm

-  cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \
-    sort >$dir/phone_processed.ctm
+  # filter out those utterances which only appear in phone_processed.ctm but not in word_processed.ctm
+  cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %010.0f PHONE %s\n", $1, $2, 1000*($3+(0.5*$4)), $5);}' | \
+    awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/word_processed.ctm - \
+    > $dir/phone_processed.ctm

   # merge-sort both ctm's
   sort -m $dir/word_processed.ctm $dir/phone_processed.ctm > $dir/combined.ctm
-
 fi
+# after merge-sort of the two ctm's, we add <eps> to cover "deserted" phones due to precision limits, and then merge all consecutive <eps>'s.
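The combined.ctm built above interleaves, per recording/channel key (the ctm's first two fields joined by "-") and in time order, each word's START and END markers with the PHONE markers whose midpoints fall between them. The stage-9 pipeline just below walks that stream and writes one "utt word phone1 ... phoneN" line per word into ctm_prons.txt, attaching phones that fall outside any word to <eps>. As a rough illustration of the grouping idea only (not the recipe's actual awk/perl, and omitting the <eps> handling), a minimal Python sketch on made-up marker lines:

def group_prons(lines):
    """Yield (utt, word, phones) from a merge-sorted stream of marker lines."""
    cur_word, phones = None, []
    for line in lines:
        utt, _, marker, token = line.split()
        if marker == "START":        # a word starts here
            cur_word, phones = token, []
        elif marker == "PHONE":      # phone midpoint falls inside the current word
            phones.append(token)
        elif marker == "END":        # word finished: emit its pronunciation
            yield utt, cur_word, phones

demo = [
    "utt1-1 0000012340 START hello",
    "utt1-1 0000012410 PHONE hh",
    "utt1-1 0000012520 PHONE ah",
    "utt1-1 0000012650 PHONE l",
    "utt1-1 0000012780 PHONE ow",
    "utt1-1 0000012900 END hello",
]
for utt, word, phones in group_prons(demo):
    print(utt, word, " ".join(phones))   # -> utt1-1 hello hh ah l ow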
 if [ $stage -le 9 ]; then
-  awk '{print $3, $4}' $dir/combined.ctm | \
-    perl -e ' while (<>) { chop; @A = split(" ", $_); ($a,$b) = @A; if ($a eq "START") { $cur_word = $b; @phones = (); }
-    if ($a eq "END") { print $cur_word, " ", join(" ", @phones), "\n"; }
-    if ($a eq "PHONE") { push @phones, $b; }} ' | sort | uniq -c | sort -nr > $dir/prons.txt
+  awk '{print $1, $3, $4}' $dir/combined.ctm | \
+    perl -e ' while (<>) { chop; @A = split(" ", $_); ($utt, $a,$b) = @A; if ($a eq "START") { $cur_word = $b; @phones = (); }
+    if ($a eq "END") { print $utt, " ", $cur_word, " ", join(" ", @phones), "\n"; }
+    if ($a eq "PHONE") { if ($prev eq "END") {print $utt, " ", "<eps>", " ", $b, "\n";} else {push @phones, $b;}} $prev = $a;} ' |\
+    awk 'BEGIN{merge_prev=0;} {utt=$1;word=$2;pron=$3;for (i=4;i<=NF;i++) pron=pron" "$i;
+    if (word_prev == "<eps>" && word == "<eps>" && utt_prev == utt) {merge=0;pron_prev=pron_prev" "pron;} else {merge=1;}
+    if(merge_prev==1) {print utt_prev, word_prev, pron_prev;};
+    merge_prev=merge; utt_prev=utt; word_prev=word; pron_prev=pron;}
+    END{if(merge_prev==1) {print utt_prev, word_prev, pron_prev;}}' > $dir/ctm_prons.txt
+
+  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words
+  steps/cleanup/internal/get_pron_stats.py $dir/ctm_prons.txt $phone_lang/phones/silence.txt $phone_lang/phones/optional_silence.txt $dir/non_scored_words - | \
+    sort -nr > $dir/prons.txt
 fi

 if [ $stage -le 10 ]; then
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py
new file mode 100755
index 00000000000..414875f9013
--- /dev/null
+++ b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Xiaohui Zhang
+# Apache 2.0.
+
+from __future__ import print_function
+import argparse
+import sys
+import warnings
+
+# Collect pronunciation stats from a ctm_prons.txt file of the form output
+# by steps/cleanup/debug_lexicon.sh. This input file has lines of the form:
+# utt_id word phone1 phone2 .. phoneN
+# e.g.
+# foo-bar123-342 hello h eh l l ow
+# (and this script does require that lines from the same utterance be ordered in
+# order of time).
+# The output of this program is word pronunciation stats of the form:
+# count word phone1 .. phoneN
+# e.g.:
+# 24.0 hello h ax l l ow
+# This program uses various heuristics to account for the fact that the input
+# ctm_prons.txt file may not always be well aligned. As a result of some of
+# these heuristics the counts will not always be integers.
+
+def GetArgs():
+    parser = argparse.ArgumentParser(description = "Accumulate pronunciation statistics from "
+                                     "a ctm_prons.txt file.",
+                                     epilog = "See steps/cleanup/debug_lexicon.sh for example")
+    parser.add_argument("ctm_prons_file", metavar = "", type = str,
+                        help = "File containing word-pronunciation alignments obtained from a ctm file; "
+                        "it represents phonetic decoding results, aligned with word boundaries obtained "
+                        "from forced alignments. Each line must be <utt-id> <word> <phone1> ... <phoneN>.")
+    parser.add_argument("silence_file", metavar = "", type = str,
+                        help = "File containing a list of silence phones.")
+    parser.add_argument("optional_silence_file", metavar = "", type = str,
+                        help = "File containing the optional silence phone.
We'll be replacing empty prons by this," + "because empty prons would cause a problem for lattice word alignment.") + parser.add_argument("non_scored_words_file", metavar = "", type = str, + help = "File containing a list of non-scored words.") + parser.add_argument("stats_file", metavar = "", type = str, + help = "Write accumulated statitistics to this file; each line represents how many times " + "a specific word-pronunciation pair appears in the phonetic decoding results (ctm_pron_file)." + "each line is ") + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.ctm_prons_file == "-": + args.ctm_prons_file_handle = sys.stdin + else: + args.ctm_prons_file_handle = open(args.ctm_prons_file) + args.non_scored_words_file_handle = open(args.non_scored_words_file) + args.silence_file_handle = open(args.silence_file) + args.optional_silence_file_handle = open(args.optional_silence_file) + if args.stats_file == "-": + args.stats_file_handle = sys.stdout + else: + args.stats_file_handle = open(args.stats_file, "w") + return args + +def ReadEntries(file_handle): + entries = set() + for line in file_handle: + entries.add(line.strip()) + return entries + +# Basically, this function generates an "info" list from a ctm_prons file. +# Each entry in the list represents the pronounciation candidate(s) of a word. +# For each non- word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g: +# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] +# For each , we split the phones it aligns to into two parts: "nonsil_left", +# which includes phones before the first silphone, and "nonsil_right", which includes +# phones after the last silphone. For example, for : 'V SIL B AH SIL', +# nonsil_left is 'V' and nonsil_right is empty ''. After processing an entry +# in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right] +# only if it's nonsil_right segment is not empty, which may be used when processing +# the next word. +# +# Normally, one non- word is only aligned to one pronounciation candidate. However +# when there is a preceding/following , like in the following example, we +# assume the phones aligned to should be statistically distributed +# to its neighboring words (BTW we assume there are no consecutive within an utterance.) +# Thus we append the "nonsil_left" segment of these phones to the pronounciation +# of the preceding word, if the last phone of this pronounciation is not a silence phone, +# Similarly we can add a pron candidate to the following word. 
+# +# For example, for the following part of a ctm_prons file: +# 911Mothers_2010W-0010916-0012901-1 other AH DH ER +# 911Mothers_2010W-0010916-0012901-1 K AH N SIL B +# 911Mothers_2010W-0010916-0012901-1 because IH K HH W AA Z AH +# 911Mothers_2010W-0010916-0012901-1 V SIL +# 911Mothers_2010W-0010916-0012901-1 when W EH N +# 911Mothers_2010W-0010916-0012901-1 people P IY P AH L +# 911Mothers_2010W-0010916-0012901-1 SIL +# 911Mothers_2010W-0010916-0012901-1 heard HH ER +# 911Mothers_2010W-0010916-0012901-1 D +# 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T +# 911Mothers_2010W-0010916-0012901-1 my M AY +# +# The corresponding segment in the "info" list is: +# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] +# [911Mothers_2010W-0010916-0012901-1, , 'B' +# [911Mothers_2010W-0010916-0012901-1, because, set('IH K HH W AA Z AH', 'B IH K HH W AA Z AH', 'IH K HH W AA Z AH V', 'B IH K HH W AA Z AH V')] +# [911Mothers_2010W-0010916-0012901-1, when, set('W EH N')] +# [911Mothers_2010W-0010916-0012901-1, people, set('P IY P AH L')] +# [911Mothers_2010W-0010916-0012901-1, , 'D'] +# [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')] +# [911Mothers_2010W-0010916-0012901-1, my, set('M AY')] +# +# Then we accumulate pronouciation stats from "info". Basically, for each occurence +# of a word, each pronounciation candidate gets equal soft counts. e.g. In the above +# example, each pron candidate of "because" gets a count of 1/4. The stats is stored +# in a dictionary (word, pron) : count. + +def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_prons_file_handle): + info = [] + for line in ctm_prons_file_handle.readlines(): + splits = line.strip().split() + utt = splits[0] + word = splits[1] + phones = splits[2:] + if phones == []: + phones = [optional_silence] + # extract the nonsil_left and nonsil_right segments, and then try to + # append nonsil_left to the pron candidates of preceding word, getting + # extended pron candidates. + # Note: the ctm_pron file may have cases like: + # KevinStone_2010U-0024782-0025580-1 [UH] EH + # KevinStone_2010U-0024782-0025580-1 fda F T + # KevinStone_2010U-0024782-0025580-1 [NOISE] IY EY + # which means non-scored-words (except oov symbol /) behaves like . + # So we apply the same merging method in these cases. + if word == '' or (word in non_scored_words and word != '' and word != ''): + nonsil_left = [] + nonsil_right = [] + for phone in phones: + if phone in silphones: + break + nonsil_left.append(phone) + + for phone in reversed(phones): + if phone in silphones: + break + nonsil_right.insert(0, phone) + + # info[-1][0] is the utt_id of the last entry + if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: + # pron_ext is a set of extended pron candidates. + pron_ext = set() + # info[-1][2] is the set of pron candidates of the last entry. + for pron in info[-1][2]: + # skip generating the extended pron candidate if + # the pron ends with a silphone. 
+ ends_with_sil = False + for sil in silphones: + if pron.endswith(sil): + ends_with_sil = True + if not ends_with_sil: + pron_ext.add(pron+" "+" ".join(nonsil_left)) + if isinstance(info[-1][2], set): + info[-1][2] = info[-1][2].union(pron_ext) + if len(nonsil_right) > 0: + info.append([utt, word, " ".join(nonsil_right)]) + else: + prons = set() + prons.add(" ".join(phones)) + # If there's a preceding /non_scored_words (which means the third field is a string rather than a set of strings), + # we append it's nonsil_right segment to the pron candidates of the current word. + if len(info) > 0 and utt == info[-1][0] and isinstance(info[-1][2], str) and (phones == [] or phones[0] not in silphones): + # info[-1][2] is the nonsil_right segment of the phones aligned to the last /non_scored_words. + prons.add(info[-1][2]+' '+" ".join(phones)) + info.append([utt, word, prons]) + stats = {} + for utt, word, prons in info: + # If the prons is not a set, the current word must be or an non_scored_word, + # where we just left the nonsil_right part as prons. + if isinstance(prons, set) and len(prons) > 0: + count = 1.0 / float(len(prons)) + for pron in prons: + phones = pron.strip().split() + # post-processing: remove all begining/trailing silence phones. + # we allow only candidates that either consist of a single silence + # phone, or the silence phones are inside non-silence phones. + if len(phones) > 1: + begin = 0 + for phone in phones: + if phone in silphones: + begin += 1 + else: + break + if begin == len(phones): + begin -= 1 + phones = phones[begin:] + if len(phones) == 1: + break + end = len(phones) + for phone in reversed(phones): + if phone in silphones: + end -= 1 + else: + break + phones = phones[:end] + phones = " ".join(phones) + stats[(word, phones)] = stats.get((word, phones), 0) + count + return stats + +def WriteStats(stats, file_handle): + for word_pron, count in stats.iteritems(): + print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + silphones = ReadEntries(args.silence_file_handle) + non_scored_words = ReadEntries(args.non_scored_words_file_handle) + optional_silence = ReadEntries(args.optional_silence_file_handle) + stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle) + WriteStats(stats, args.stats_file_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/apply_g2p.sh b/egs/wsj/s5/steps/dict/apply_g2p.sh new file mode 100755 index 00000000000..1f66c838010 --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_g2p.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Xiaohui Zhang +# Apache 2.0 + +# Begin configuration section. +stage=0 +encoding='utf-8' +var_counts=3 #Generate upto N variants +var_mass=0.9 #Generate so many variants to produce 90 % of the prob mass +cmd=run.pl +nj=10 #Split the task into several parallel, to speedup things +model= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "... 
where is a list of words whose pronunciation is to be generated" + echo " is a directory used as a target during training of G2P" + echo " is the directory where the output lexicon should be stored" + echo "e.g.: $0 oov_words exp/g2p exp/g2p/oov_lex" + echo "" + echo "main options (for others, see top of script file)" + echo " --nj # How many tasks should be spawn (to speedup things)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +wordlist=$1 +modeldir=$2 +output=$3 + + +mkdir -p $output/log + +model=$modeldir/g2p.model.final +[ ! -f ${model:-} ] && echo "File $model not found in the directory $modeldir." && exit 1 +#[ ! -x $wordlist ] && echo "File $wordlist not found!" && exit 1 + +cp $wordlist $output/wordlist.txt + +if ! g2p=`which g2p.py` ; then + echo "The Sequitur was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh" + exit 1 +fi + +echo "Applying the G2P model to wordlist $wordlist" + +if [ $stage -le 0 ]; then + $cmd JOBS=1:$nj $output/log/apply.JOBS.log \ + split -n l/JOBS/$nj $output/wordlist.txt \| \ + g2p.py -V $var_mass --variants-number $var_counts --encoding $encoding \ + --model $modeldir/g2p.model.final --apply - \ + \> $output/output.JOBS +fi +cat $output/output.* > $output/output + +# Remap the words from output file back to the original casing +# Conversion of some of thems might have failed, so we have to be careful +# and use the transform_map file we generated beforehand +# Also, because the sequitur output is not readily usable as lexicon (it adds +# one more column with ordering of the pron. variants) convert it into the proper lexicon form +output_lex=$output/lexicon.lex + +# Just convert it to a proper lexicon format +cut -f 1,3,4 $output/output > $output_lex + +# Some words might have been removed or skipped during the process, +# let's check it and warn the user if so... +nlex=`cut -f 1 $output_lex | sort -u | wc -l` +nwlist=`cut -f 1 $output/wordlist.txt | sort -u | wc -l` +if [ $nlex -ne $nwlist ] ; then + echo "WARNING: Unable to generate pronunciation for all words. "; + echo "WARINNG: Wordlist: $nwlist words" + echo "WARNING: Lexicon : $nlex words" + echo "WARNING:Diff example: " + diff <(cut -f 1 $output_lex | sort -u ) \ + <(cut -f 1 $output/wordlist.txt | sort -u ) || true +fi +exit 0 diff --git a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py new file mode 100755 index 00000000000..a5bdbc30d46 --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +import argparse +import sys + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon" + "to produce a learned lexicon.", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("in_lexicon", metavar='', type = str, + help = "Input lexicon. Each line must be .") + parser.add_argument("lexicon_edits_file", metavar='', type = str, + help = "Input lexicon edits file containing human-readable & editable" + "pronounciation info. 
The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represents the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs. See steps/dict/select_prons_bayesian.py for more details.") + parser.add_argument("out_lexicon", metavar='', type = str, + help = "Output lexicon to this file.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.in_lexicon == "-": + args.in_lexicon = sys.stdin + else: + args.in_lexicon_handle = open(args.in_lexicon) + args.lexicon_edits_file_handle = open(args.lexicon_edits_file) + + if args.out_lexicon == "-": + args.out_lexicon_handle = sys.stdout + else: + args.out_lexicon_handle = open(args.out_lexicon, "w") + + return args + +def ReadLexicon(lexicon_file_handle): + lexicon = set() + if lexicon_file_handle: + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + phones = ' '.join(splits[1:]) + lexicon.add((word, phones)) + return lexicon + +def ApplyLexiconEdits(lexicon, lexicon_edits_file_handle): + if lexicon_edits_file_handle: + for line in lexicon_edits_file_handle.readlines(): + # skip all commented lines + if line.startswith('#'): + continue + # read a word from a line like "---- MICROPHONES 200.0 ----". + if line.startswith('---'): + splits = line.strip().strip('-').strip().split() + if len(splits) != 2: + print(splits, file=sys.stderr) + raise Exception('Invalid format of line ' + line + + ' in lexicon edits file.') + word = splits[0].strip() + else: + # parse the pron and decision 'Y/N' of accepting the pron or not, + # from a line like: 'P | Y | 42.0 | M AY K R AH F OW N Z' + splits = line.split('|') + if len(splits) != 4: + raise Exception('Invalid format of line ' + line + + ' in lexicon edits file.') + pron = splits[3].strip() + if splits[1].strip() == 'Y': + lexicon.add((word, pron)) + elif splits[1].strip() == 'N': + lexicon.discard((word, pron)) + else: + raise Exception('Invalid format of line ' + line + + ' in lexicon edits file.') + return lexicon + + +def WriteLexicon(lexicon, out_lexicon_handle): + for word, pron in lexicon: + print('{0} {1}'.format(word, pron), file=out_lexicon_handle) + out_lexicon_handle.close() + +def Main(): + args = GetArgs() + lexicon = ReadLexicon(args.in_lexicon_handle) + ApplyLexiconEdits(lexicon, args.lexicon_edits_file_handle) + WriteLexicon(lexicon, args.out_lexicon_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/get_pron_stats.py b/egs/wsj/s5/steps/dict/get_pron_stats.py new file mode 100755 index 00000000000..b5202a69abb --- /dev/null +++ b/egs/wsj/s5/steps/dict/get_pron_stats.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# 2016 Vimal Manohar +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Accumulate statistics from lattice-alignment outputs for lexicon" + "learning. 
The inputs are a file containing arc level information from lattice-align-words," + "and a map which maps word-position-dependent phones to word-position-independent phones" + "(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts" + "of pronunciations", + epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" + " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" + " exp/tri3_lex_0.4_work/lats/pron_stats.txt" + "See steps/dict/learn_lexicon.sh for examples in detail.") + + parser.add_argument("arc_info_file", metavar = "", type = str, + help = "Input file containing per arc statistics; " + "each line must be ") + parser.add_argument("phone_map", metavar = "", type = str, + help = "An input phone map used to remove word boundary markers from phones;" + "generated in steps/cleanup/debug_lexicon.sh") + parser.add_argument("stats_file", metavar = "", type = str, + help = "Write accumulated statitistics to this file;" + "each line is ") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.arc_info_file == "-": + args.arc_info_file_handle = sys.stdin + else: + args.arc_info_file_handle = open(args.arc_info_file) + args.phone_map_handle = open(args.phone_map) + + if args.stats_file == "-": + args.stats_file_handle = sys.stdout + else: + args.stats_file_handle = open(args.stats_file, "w") + + return args + + +def GetStatsFromArcInfo(arc_info_file_handle, phone_map_handle): + prons = defaultdict(set) + # need to map the phones to remove word boundary markers. + phone_map = {} + stats_unmapped = {} + stats = {} + for line in phone_map_handle.readlines(): + splits = line.strip().split() + phone_map[splits[0]] = splits[1] + + for line in arc_info_file_handle.readlines(): + splits = line.strip().split() + if (len(splits) == 0): + continue + if (len(splits) < 6): + raise Exception('Invalid format of line ' + line + + ' in arc_info_file') + word = splits[4] + count = float(splits[3]) + phones = " ".join(splits[5:]) + prons[word].add(phones) + stats_unmapped[(word, phones)] = stats_unmapped.get((word, phones), 0) + count + + for word_pron, count in stats_unmapped.iteritems(): + phones_unmapped = word_pron[1].split() + phones = [phone_map[phone] for phone in phones_unmapped] + stats[(word_pron[0], " ".join(phones))] = count + return stats + +def WriteStats(stats, file_handle): + for word_pron, count in stats.iteritems(): + print('{2} {0} {1}'.format(word_pron[0], word_pron[1], count), + file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + stats = GetStatsFromArcInfo(args.arc_info_file_handle, args.phone_map_handle) + WriteStats(stats, args.stats_file_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py new file mode 100755 index 00000000000..1f2863424f3 --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. 
Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top r * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." + "r is a user-specified constant, like 2.", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("--r", type = float, default = "2.0", + help = "a user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word.") + parser.add_argument("pron_stats", metavar = "", type = str, + help = "File containing soft-counts of all pronounciation candidates; " + "each line must be ") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "Reference lexicon file, where we obtain # pron variants for" + "each word, based on which we prune the pron candidates." + "Each line must be ") + parser.add_argument("pruned_prons", metavar = "", type = str, + help = "An output file in lexicon format, which contains prons we want to" + "prune off from the pron_stats file.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.pron_stats_handle = open(args.pron_stats) + args.ref_lexicon_handle = open(args.ref_lexicon) + if args.pruned_prons == "-": + args.pruned_prons_handle = sys.stdout + else: + args.pruned_prons_handle = open(args.pruned_prons, "w") + return args + +def ReadStats(pron_stats_handle): + stats = defaultdict(list) + for line in pron_stats_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in stats file.') + count = float(splits[0]) + word = splits[1] + phones = ' '.join(splits[2:]) + stats[word].append((phones, count)) + + for word, entry in stats.iteritems(): + entry.sort(key=lambda x: x[1]) + return stats + +def ReadLexicon(ref_lexicon_handle): + ref_lexicon = defaultdict(set) + for line in ref_lexicon_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + phones = ' '.join(splits[1:]) + ref_lexicon[word].add(phones) + return ref_lexicon + +def PruneProns(args, stats, ref_lexicon): + # Compute the average # pron variants counts per word in the reference lexicon. 
+ num_words_ref = 0 + num_prons_ref = 0 + for word, prons in ref_lexicon.iteritems(): + num_words_ref += 1 + num_prons_ref += len(prons) + avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) + + for word, entry in stats.iteritems(): + if word in ref_lexicon: + variants_counts = args.r * len(ref_lexicon[word]) + else: + variants_counts = args.r * avg_variants_counts_ref + num_variants = 0 + while num_variants < variants_counts: + try: + pron, prob = entry.pop() + if word not in ref_lexicon or pron not in ref_lexicon[word]: + num_variants += 1 + except IndexError: + break + + for word, entry in stats.iteritems(): + for pron, prob in entry: + if word not in ref_lexicon or pron not in ref_lexicon[word]: + print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) + +def Main(): + args = GetArgs() + ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + stats = ReadStats(args.pron_stats_handle) + PruneProns(args, stats, ref_lexicon) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/learn_lexicon.sh b/egs/wsj/s5/steps/dict/learn_lexicon.sh new file mode 100755 index 00000000000..7f32428c059 --- /dev/null +++ b/egs/wsj/s5/steps/dict/learn_lexicon.sh @@ -0,0 +1,410 @@ +#! /bin/bash + +# Copyright 2016 Xiaohui Zhang +# 2016 Vimal Manohar +# Apache 2.0 + +# This script demonstrate how to expand a existing lexicon using a combination +# of acoustic evidence and G2P to learn a lexicon that covers words in a target +# vocab, and agrees sufficiently with the acoustics. The basic idea is to +# run phonetic decoding on acoustic training data using an existing +# acoustice model (possibly re-trained using a G2P-expanded lexicon) to get +# alternative pronunciations for words in training data. Then we combine three +# exclusive sources of pronunciations: the reference lexicon (supposedly +# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run +# lattice alignment on the same data, to collect acoustic evidence (soft +# counts) of all pronunciations. Based on these statistics, and +# user-specified prior-counts (parameterized by prior mean and prior-counts-tot, +# assuming the prior follows a Dirichlet distribution), we then use a Bayesian +# framework to compute posteriors of all pronunciations for each word, +# and then select best pronunciations for each word. The output is a final learned lexicon +# whose vocab matches the user-specified target-vocab, and two intermediate resultis: +# an edits file which records the recommended changes to all in-ref-vocab words' +# prons, and a half-learned lexicon where all in-ref-vocab words' prons were untouched +# (on top of which we apply the edits file to produce the final learned lexicon). +# The user can always modify the edits file manually and then re-apply it on the +# half-learned lexicon using steps/dict/apply_lexicon_edits to produce the final +# learned lexicon. See the last stage in this script for details. + + +stage=0 +# Begin configuration section. +cmd=run.pl +nj= +stage=6 +oov_symbol= +lexicon_g2p= +min_prob=0.3 +variants_prob_mass=0.7 +variants_prob_mass_ref=0.9 +prior_counts_tot=15 +prior_mean="0.7,0.2,0.1" +num_gauss= +num_leaves= +retrain_src_mdl=true +cleanup=true +# End configuration section. + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 7 ]; then + echo "Usage: $0 [options] \\" + echo " ." 
+ echo " This script does lexicon expansion using a combination of acoustic" + echo " evidence and G2P to produce a lexicon that covers words of a target vocab:" + echo "" + echo "Arguments:" + echo " the dir which contains the reference lexicon (most probably hand-derived)" + echo " we want to expand/improve, and nonsilence_phones.txt,.etc which we need " + echo " for building new dict dirs." + echo " the vocabulary we want the final learned lexicon to cover (one word per line)." + echo " acoustic training data we use to get alternative" + echo " pronunciations and collet acoustic evidence." + echo " The dir containing an SAT-GMM acoustic model (we optionaly we re-train it" + echo " using G2P expanded lexicon) to do phonetic decoding (to get alternative" + echo " pronunciations) and lattice-alignment (to collect acoustic evidence for" + echo " evaluating all prounciations)" + echo " the reference lang dir which we use to get non-scored-words" + echo " like for building new dict dirs" + echo " the dict dir where we put the final learned lexicon, whose vocab" + echo " matches ." + echo "" + echo "Note: and the vocab of don't have to match. For words" + echo " who are in but not seen in , their pronunciations" + echo " will be given by G2P at the end." + echo "" + echo "e.g. $0 data/local/dict data/local/lm/librispeech-vocab.txt data/train \\" + echo " exp/tri3 data/lang data/local/dict_learned" + echo "Options:" + echo " --stage # stage to run from, to enable resuming from partially" + echo " # completed run (default: 0)" + echo " --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl)" + echo " --nj # number of parallel jobs" + echo " --oov-symbol '$oov_symbol' # oov symbol, like ." + echo " --g2p-pron-candidates # A lexicon file containing g2p generated pronunciations, for words in acoustic training " + echo " # data / target vocabulary. It's optional." + echo " --min-prob # The cut-off parameter used to select pronunciation candidates from phonetic" + echo " # decoding. We remove pronunciations with probabilities less than this value" + echo " # after normalizing the probs s.t. the max-prob is 1.0 for each word." + echo " --prior-mean # Mean of priors (summing up to 1) assigned to three exclusive pronunciation" + echo " # source: reference lexicon, g2p, and phonetic decoding (used in the Bayesian" + echo " # pronunciation selection procedure). We recommend setting a larger prior" + echo " # mean for the reference lexicon, e.g. '0.6,0.2,0.2'." + echo " --prior-counts-tot # Total amount of prior counts we add to all pronunciation candidates of" + echo " # each word. By timing it with the prior mean of a source, and then dividing" + echo " # by the number of candidates (for a word) from this source, we get the" + echo " # prior counts we actually add to each candidate." + echo " --variants-prob-mass # In the Bayesian pronunciation selection procedure, for each word, we" + echo " # choose candidates (from all three sources) with highest posteriors" + echo " # until the total prob mass hit this amount." + echo " # It's used in a similar fashion when we apply G2P." + echo " --variants-prob-mass-ref # In the Bayesian pronunciation selection procedure, for each word," + echo " # after the total prob mass of selected candidates hit variants-prob-mass," + echo " # we continue to pick up reference candidates with highest posteriors" + echo " # until the total prob mass hit this amount (must >= variants-prob-mass)." 
+ echo " --num-gauss # number of gaussians for the re-trained SAT model (on top of )." + echo " --num-leaves # number of leaves for the re-trained SAT model (on top of )." + echo " --retrain-src-mdl # true if you want to re-train the src_mdl before phone decoding (default false)." + exit 1 +fi + +echo "$0 $@" # Print the command line for logging + +ref_dict=$1 +target_vocab=$2 +data=$3 +src_mdl_dir=$4 +ref_lang=$5 +dest_dict=$6 +dir=$7 # Most intermediate outputs will be put here. + +mkdir -p $dir +if [ $stage -le 0 ]; then + echo "$0: Some preparatory work." + # Get the word counts of training data. + awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \ + $data/text | sort > $dir/train_counts.txt + + # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab. + steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt > $dir/non_scored_entries + + # Remove non-scored-words from the reference lexicon. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt | tr -s '\t' ' ' > $dir/ref_lexicon.txt + + cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $target_vocab | sort | uniq > $dir/target_vocab.txt + + # From the reference lexicon, we estimate the target_num_prons_per_word as, + # ceiling(avg. # prons per word in the reference lexicon). This'll be used as + # the upper bound of # pron variants per word when we apply G2P or select prons to + # construct the learned lexicon in later stages. + python -c 'import sys; import math; print int(math.ceil(float(sys.argv[1])/float(sys.argv[2])))' \ + `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ + > $dir/target_num_prons_per_word || exit 1; + + if [ -z $lexicon_g2p ]; then + # create an empty list of g2p generated prons, if it's not given. + touch $dir/lexicon_g2p.txt + else + cp $lexicon_g2p $dir/lexicon_g2p.txt 2>/dev/null + fi +fi + +if [ $stage -le 1 ] && $retrain_src_mdl; then + echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then" + echo " ... re-train the source acoustic model for phonetic decoding. " + mkdir -p $dir/dict_expanded_target_vocab + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_target_vocab 2>/dev/null + rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in the target vocab. + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt + + # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which + # cannot be found in lexicon_g2p.txt, we simply ignore them. 
+ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \ + $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt + + cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \ + cat $dir/non_scored_entries - | + sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \ + $oov_symbol $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1; + + # Align the acoustic training data using the given src_mdl_dir. + alidir=${src_mdl_dir}_ali_$(basename $data) + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1; + + # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained + # this model will be used for phonetic decoding and lattice alignment later on. + if [ -z $num_leaves ] || [ -z $num_gauss ] ; then + echo "num_leaves and num_gauss need to be specified." && exit 1; + fi + steps/train_sat.sh --cmd "$train_cmd" $num_leaves $num_gauss \ + $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Expand the reference lexicon to cover all words seen in," + echo " ... acoustic training data, and prepare corresponding dict and lang directories." + echo " ... This is needed when generate pron candidates from phonetic decoding." + mkdir -p $dir/dict_expanded_train + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_train 2>/dev/null + rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in training data. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ + $dir/train_counts.txt | sort > $dir/oov_train.txt + + awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate + + echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" + cat $dir/train_oov_rate + + # Assign pronunciations from lexicon_g2p to oov_train. For words which + # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton + # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | cut -f2- -d' '` + # For oov words in training data for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ). 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ + $dir/oov_train.txt | awk -v op=$oov_pron '{print $0" "op}' > $dir/oov_train_no_pron.txt + + cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat - $dir/non_scored_entries | \ + sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1; + + utils/prepare_lang.sh $dir/dict_expanded_train $oov_symbol \ + $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.." + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/cleanup/debug_lexicon.sh --nj $nj --cmd "$decode_cmd" $data $dir/lang_expanded_train \ + $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1; + + # We prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", + # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt > $dir/phonetic_decoding/filter_lexicon.txt + + $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ + --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ + $dir/phonetic_decoding/prons.txt $dir/lexicon_phonetic_decoding_with_eps.txt + cat $dir/lexicon_phonetic_decoding_with_eps.txt | grep -vP "|||\[.*\]" | \ + sort | uniq > $dir/lexicon_phonetic_decoding.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one" + echo " ... lexicon, and run lattice alignment using this lexicon on acoustic training data" + echo " ... to collect acoustic evidence." + # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon. + mkdir -p $dir/dict_combined_iter1 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter1/ 2>/dev/null + rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_phonetic_decoding.txt $dir/lexicon_g2p.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter1/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter1 $oov_symbol \ + $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1; + + # Generate lattices for the acoustic training data with the combined lexicon. + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + $data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; + + # Get arc level information from the lattice. 
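The pipeline below (lattice-align-words, then lattice-arc-post, then int2sym.pl) produces per-arc lines in which, as read by the steps/dict/get_pron_stats.py added later in this patch, the 4th field is the arc posterior, the 5th is the word, and the remaining fields are its (still word-position-dependent) phones. A minimal sketch of the accumulation that script then performs, omitting the phone_map.txt step that strips the word-position markers:

from collections import defaultdict

def accumulate_pron_stats(arc_info_lines):
    """Sum lattice posteriors per (word, pronunciation) pair."""
    stats = defaultdict(float)
    for line in arc_info_lines:
        splits = line.split()
        if len(splits) < 6:
            continue                     # skip malformed or empty lines
        posterior = float(splits[3])     # soft count contributed by this arc
        word = splits[4]
        pron = " ".join(splits[5:])      # phones, still position-dependent here
        stats[(word, pron)] += posterior
    return stats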
+ $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \ + lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \ + $dir/lats_iter1/final.mdl \ + "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \ + lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \ + utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \ + utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \ + $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1; + + # Get soft counts of all pronunciations from arc level information. + cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \ + $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment." + mkdir -p $dir/dict_combined_iter2 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter2/ 2>/dev/null + rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null + + # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment. + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py $dir/lats_iter1/pron_stats.txt $dir/ref_lexicon.txt $dir/pruned_prons.txt + + awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_phonetic_decoding.txt \ + > $dir/lexicon_phonetic_decoding_pruned.txt + + awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_g2p.txt \ + > $dir/lexicon_g2p_pruned.txt \ + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter2/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter2 $oov_symbol \ + $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1; + + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + $data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1; + + # Get arc level information from the lattice. + $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \ + lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \ + $dir/lats_iter2/final.mdl \ + "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \ + lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \ + utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \ + utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \ + $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1; + + # Get soft counts of all pronunciations from arc level information. + cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \ + $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1; +fi + +if [ $stage -le 6 ]; then + echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment." + # Given the acoustic evidence (soft-counts), we use a Bayesian framework to select pronunciations + # from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding. 
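+  # Illustrative numbers only: with the defaults above (prior_mean="0.7,0.2,0.1",
+  # prior_counts_tot=15), a word with 2 reference prons, 1 G2P pron and 3
+  # phonetic-decoding prons gets per-candidate prior counts of about
+  # 0.7*15/2 = 5.25, 0.2*15/1 = 3.0 and 0.1*15/3 = 0.5 respectively; these are
+  # combined with the soft counts collected above when posteriors are computed,
+  # and candidates are then kept greedily until variants_prob_mass (and, for
+  # reference prons, variants_prob_mass_ref) is reached.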
+ # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt + # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt. + # Among them, for words without acoustic evidence, we just ignore them, even if pron candidates from G2P were provided). + # For words in the ref. vocab, we instead output a human readable & editable "edits" file called + # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a + # summary is printed into the log file. + + variants_counts=`cat $dir/target_num_prons_per_word` || exit 1; + $cmd $dir/lats_iter2/log/select_prons_bayesian.log \ + steps/dict/select_prons_bayesian.py --prior-mean=$prior_mean --prior-counts-tot=$prior_counts_tot \ + --variants-counts=$variants_counts --variants-prob-mass=$variants_prob_mass --variants-prob-mass-ref=$variants_prob_mass_ref \ + $ref_dict/silence_phones.txt $dir/lats_iter2/pron_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_phonetic_decoding_pruned.txt \ + $dir/lats_iter2/pron_posteriors.temp $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/lats_iter2/ref_lexicon_edits.txt + + # We reformat the pron_posterior file and add some comments. + paste <(cat $dir/lats_iter2/pron_posteriors.temp | cut -d' ' -f1-3 | column -t) \ + <(cat $dir/lats_iter2/pron_posteriors.temp | cut -d' ' -f4-) | sort -nr -k1,3 | \ + cat <( echo ';; ') - \ + > $dir/lats_iter2/pron_posteriors.txt + rm $dir/pron_posteriors.temp 2>/dev/null + + # Remove some stuff that takes up space and is unlikely to be useful later on. + if $cleanup; then + rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: Expand the learned lexicon further to cover words in target vocab that are." + echo " ... not seen in acoustic training data." + mkdir -p $dest_dict + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dest_dict 2>/dev/null + rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null + # Get the list of oov (w.r.t. ref vocab) without acoustic evidence, which are in the + # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. + cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_no_acoustics.txt + + # We concatenate three lexicons togethers: G2P lexicon for oov words without acoustics, + # learned lexicon for oov words with acoustics, and the original reference lexicon (for + # this part, later one we'll apply recommended changes using steps/dict/apply_lexicon_edits.py + cat $dir/g2p_prons_for_oov_no_acoustics.txt $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt \ + $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil + + cat $dir/non_scored_entries $dest_dict/lexicon.nosil | sort | uniq >$dest_dict/lexicon0.txt +fi + +if [ $stage -le 8 ]; then + echo "$0: Apply the ref_lexicon_edits file to the reference lexicon." + echo " ... The user can inspect/modify the edits file and then re-run:" + echo " ... 
steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \\" + echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." + cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null + steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ + sort | uniq > $dest_dict/lexicon.txt +fi diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py new file mode 100755 index 00000000000..2a87d172602 --- /dev/null +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python + +# Copyright 2016 Vimal Manohar +# 2016 Xiaohui Zhang +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse +import sys + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phone level decoding) " + "into a lexicon for lexicon learning. We prune the pronunciations " + "based on a provided stats file, and optionally filter out entries which are present " + "in a filter lexicon.", + epilog = "e.g. steps/dict/prons_to_lexicon.py --min-prob=0.4 \\" + "--filter-lexicon=exp/tri3_lex_0.4_work/phone_decode/filter_lexicon.txt \\" + "exp/tri3_lex_0.4_work/phone_decode/prons.txt \\" + "exp/tri3_lex_0.4_work/lexicon_phone_decoding.txt" + "See steps/dict/learn_lexicon.sh for examples in detail.") + + parser.add_argument("--set-sum-to-one", type = str, default = False, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize lexicon such that the sum of " + "probabilities is 1.") + parser.add_argument("--set-max-to-one", type = str, default = True, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize lexicon such that the max " + "probability is 1.") + parser.add_argument("--min-prob", type = float, default = 0.1, + help = "Remove pronunciation with probabilities less " + "than this value after normalization.") + parser.add_argument("--filter-lexicon", metavar='', type = str, default = '', + help = "Exclude entries in this filter lexicon from the output lexicon." + "each line must be ") + parser.add_argument("stats_file", metavar='', type = str, + help = "Input file containing pronunciation statistics, representing how many times " + "each word-pronunciation appear in the phonetic decoding results." 
+ "each line must be ") + parser.add_argument("out_lexicon", metavar='', type = str, + help = "Output lexicon.") + + print (' '.join(sys.argv), file = sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.stats_file == "-": + args.stats_file_handle = sys.stdin + else: + args.stats_file_handle = open(args.stats_file) + + if args.filter_lexicon is not '': + if args.filter_lexicon == "-": + args.filter_lexicon_handle = sys.stdout + else: + args.filter_lexicon_handle = open(args.filter_lexicon) + + if args.out_lexicon == "-": + args.out_lexicon_handle = sys.stdout + else: + args.out_lexicon_handle = open(args.out_lexicon, "w") + + if args.set_max_to_one == args.set_sum_to_one: + raise Exception("Cannot have both " + "set-max-to-one and set-sum-to-one as true or false.") + + return args + +def ReadStats(args): + lexicon = {} + word_count = {} + for line in args.stats_file_handle: + splits = line.strip().split() + if len(splits) < 3: + continue + + word = splits[1] + count = float(splits[0]) + phones = ' '.join(splits[2:]) + + lexicon[(word, phones)] = lexicon.get((word, phones), 0) + count + word_count[word] = word_count.get(word, 0) + count + + return [lexicon, word_count] + +def ReadLexicon(lexicon_file_handle): + lexicon = set() + if lexicon_file_handle: + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + phones = ' '.join(splits[1:]) + lexicon.add((word, phones)) + return lexicon + +def ConvertWordCountsToProbs(args, lexicon, word_count): + word_probs = {} + for entry, count in lexicon.iteritems(): + word = entry[0] + phones = entry[1] + prob = float(count) / float(word_count[word]) + if word in word_probs: + word_probs[word].append((phones, prob)) + else: + word_probs[word] = [(phones, prob)] + + return word_probs + +def ConvertWordProbsToLexicon(word_probs): + lexicon = {} + for word, entry in word_probs.iteritems(): + for x in entry: + lexicon[(word, x[0])] = lexicon.get((word,x[0]), 0) + x[1] + return lexicon + +def NormalizeLexicon(lexicon, set_max_to_one = True, + set_sum_to_one = False, min_prob = 0): + word_probs = {} + for entry, prob in lexicon.iteritems(): + t = word_probs.get(entry[0], (0,0)) + word_probs[entry[0]] = (t[0] + prob, max(t[1], prob)) + + for entry, prob in lexicon.iteritems(): + if set_max_to_one: + prob = prob / word_probs[entry[0]][1] + elif set_sum_to_one: + prob = prob / word_probs[entry[0]][0] + if prob < min_prob: + prob = 0 + lexicon[entry] = prob + +def WriteLexicon(args, lexicon, filter_lexicon): + words = set() + num_removed = 0 + num_filtered = 0 + for entry, prob in lexicon.iteritems(): + if prob == 0: + num_removed += 1 + continue + if entry in filter_lexicon: + num_filtered += 1 + continue + words.add(entry[0]) + print("{0} {1}".format(entry[0], entry[1]), + file = args.out_lexicon_handle) + print ("Before pruning, the total num. pronunciations is: {}".format(len(lexicon)), file=sys.stderr) + print ("Removed {0} pronunciations by setting min_prob {1}".format(num_removed, args.min_prob), file=sys.stderr) + print ("Filtered out {} pronunciations in the filter lexicon.".format(num_filtered), file=sys.stderr) + num_prons_from_phone_decoding = len(lexicon) - num_removed - num_filtered + print ("Num. pronunciations in the output lexicon, which solely come from phone decoding" + "is {0}. num. 
words is {1}".format(num_prons_from_phone_decoding, len(words)), file=sys.stderr) + +def Main(): + args = GetArgs() + + [lexicon, word_count] = ReadStats(args) + + word_probs = ConvertWordCountsToProbs(args, lexicon, word_count) + + lexicon = ConvertWordProbsToLexicon(word_probs) + filter_lexicon = ReadLexicon(args.filter_lexicon_handle) + NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, + set_sum_to_one = args.set_sum_to_one, + min_prob = args.min_prob) + WriteLexicon(args, lexicon, filter_lexicon) + args.out_lexicon_handle.close() + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py new file mode 100755 index 00000000000..affc5b17705 --- /dev/null +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top r * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." + "r is a user-specified constant, like 2.", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("--r", type = float, default = "2.0", + help = "a user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word.") + parser.add_argument("pron_stats", metavar = "", type = str, + help = "File containing soft-counts of all pronounciation candidates; " + "each line must be ") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "Reference lexicon file, where we obtain # pron variants for" + "each word, based on which we prune the pron candidates.") + parser.add_argument("pruned_prons", metavar = "", type = str, + help = "A file in lexicon format, which contains prons we want to" + "prune away from the pron_stats file.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.pron_stats_handle = open(args.pron_stats) + args.ref_lexicon_handle = open(args.ref_lexicon) + if args.pruned_prons == "-": + args.pruned_prons_handle = sys.stdout + else: + args.pruned_prons_handle = open(args.pruned_prons, "w") + return args + +def ReadStats(pron_stats_handle): + stats = defaultdict(list) + for line in pron_stats_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in stats file.') + count = float(splits[0]) + word = splits[1] + phones = ' '.join(splits[2:]) + stats[word].append((phones, count)) + + for word, entry in stats.iteritems(): + entry.sort(key=lambda x: x[1]) + return stats + +def ReadLexicon(ref_lexicon_handle): + ref_lexicon = defaultdict(set) + for line in ref_lexicon_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + try: + phones = ' 
'.join(splits[1:]) + except ValueError: + phones = ' '.join(splits[1:]) + ref_lexicon[word].add(phones) + return ref_lexicon + +def PruneProns(args, stats, ref_lexicon): + # Compute the average number of pron variants per word in the reference lexicon. + num_words_ref = 0 + num_prons_ref = 0 + for word, prons in ref_lexicon.iteritems(): + num_words_ref += 1 + num_prons_ref += len(prons) + avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) + + for word, entry in stats.iteritems(): + if word in ref_lexicon: + variants_counts = args.r * len(ref_lexicon[word]) + else: + variants_counts = args.r * avg_variants_counts_ref + num_variants = 0 + while num_variants < variants_counts: + try: + pron, prob = entry.pop() + if word not in ref_lexicon or pron not in ref_lexicon[word]: + num_variants += 1 + except IndexError: + break + + for word, entry in stats.iteritems(): + for pron, prob in entry: + if word not in ref_lexicon or pron not in ref_lexicon[word]: + print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) + +def Main(): + args = GetArgs() + ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + stats = ReadStats(args.pron_stats_handle) + PruneProns(args, stats, ref_lexicon) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/select_prons_bayesian.py b/egs/wsj/s5/steps/dict/select_prons_bayesian.py new file mode 100755 index 00000000000..e728a4af0b8 --- /dev/null +++ b/egs/wsj/s5/steps/dict/select_prons_bayesian.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Use a Bayesian framework to select " + "pronunciation candidates from three sources: reference lexicon" + ", G2P lexicon and phonetic-decoding lexicon. The inputs are a word-stats file, " + "a pron-stats file, and three source lexicons (ref/G2P/phonetic-decoding). " + "We assume the pronunciations for each word follow a Categorical distribution " + "with Dirichlet priors. Thus, with user-specified prior counts (parameterized by " + "prior-mean and prior-counts-tot) and observed counts from the pron-stats file, " + "we can compute the posterior for each pron, and select candidates with highest " + "posteriors, until we hit user-specified variants-prob-mass/counts thresholds. " + "The outputs are: a file specifying posteriors of all candidate prons (pron_posteriors), " + "a learned lexicon for words out of the ref. vocab (learned_lexicon_oov), " + "and a lexicon_edits file containing suggested modifications of prons, for " + "words within the ref. vocab (ref_lexicon_edits).", + epilog = "See steps/dict/learn_lexicon.sh for example.") + parser.add_argument("--prior-mean", type = str, default = "0,0,0", + help = "Mean of priors (summing up to 1) assigned to three exclusive " + "pronunciation sources: reference lexicon, g2p, and phonetic decoding. We " + "recommend setting a larger prior mean for the reference lexicon, e.g. '0.6,0.2,0.2'") + parser.add_argument("--prior-counts-tot", type = float, default = 15.0, + help = "Total amount of prior counts we add to all pronunciation candidates of " + "each word. 
By timing it with the prior mean of a source, and then dividing" + "by the number of candidates (for a word) from this source, we get the" + "prior counts we actually add to each candidate.") + parser.add_argument("--variants-prob-mass", type = float, default = 0.7, + help = "For each word, we pick up candidates (from all three sources)" + "with highest posteriors until the total prob mass hit this amount.") + parser.add_argument("--variants-prob-mass-ref", type = float, default = 0.9, + help = "For each word, after the total prob mass of selected candidates " + "hit variants-prob-mass, we continue to pick up reference candidates" + "with highest posteriors until the total prob mass hit this amount (must >= variants-prob-mass).") + parser.add_argument("--variants-counts", type = int, default = 1, + help = "Generate upto this many variants of prons for each word out" + "of the ref. lexicon.") + parser.add_argument("silence_file", metavar = "", type = str, + help = "File containing a list of silence phones.") + parser.add_argument("pron_stats_file", metavar = "", type = str, + help = "File containing pronunciation statistics from lattice alignment; " + "each line must be .") + parser.add_argument("word_counts_file", metavar = "", type = str, + help = "File containing word counts in acoustic training data; " + "each line must be .") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "The reference lexicon (most probably hand-derived)." + "Each line must be ") + parser.add_argument("g2p_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from G2P results." + "Each line must be ") + parser.add_argument("phonetic_decoding_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from phonetic decoding results." + "Each line must be ") + parser.add_argument("pron_posteriors", metavar = "", type = str, + help = "Output file containing posteriors of all candidate prons for each word," + "based on which we select prons to construct the learned lexicon." + "each line is ") + parser.add_argument("learned_lexicon_oov", metavar = "", type = str, + help = "Output file which is the learned lexicon for words out of the ref. vocab.") + parser.add_argument("ref_lexicon_edits", metavar = "", type = str, + help = "Output file containing human-readable & editable pronounciation info (and the" + "accept/reject decision made by our algorithm) for those words in ref. vocab," + "to which any change has been recommended. The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represents the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs. 
" + "See the function WriteEditsAndSummary for more details.") + + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.silence_file_handle = open(args.silence_file) + if args.pron_stats_file == "-": + args.pron_stats_file_handle = sys.stdin + else: + args.pron_stats_file_handle = open(args.pron_stats_file) + args.word_counts_file_handle = open(args.word_counts_file) + args.ref_lexicon_handle = open(args.ref_lexicon) + args.g2p_lexicon_handle = open(args.g2p_lexicon) + args.phonetic_decoding_lexicon_handle = open(args.phonetic_decoding_lexicon) + args.pron_posteriors_handle = open(args.pron_posteriors, "w") + args.learned_lexicon_oov_handle = open(args.learned_lexicon_oov, "w") + args.ref_lexicon_edits_handle = open(args.ref_lexicon_edits, "w") + + prior_mean = args.prior_mean.strip().split(',') + if len(prior_mean) is not 3: + raise Exception('Invalid Dirichlet prior mean ', args.prior_mean) + for i in range(0,3): + if float(prior_mean[i]) <= 0 or float(prior_mean[i]) >= 1: + raise Exception('Dirichlet prior mean', prior_mean[i], 'is invalid, it must be between 0 and 1.') + args.prior_mean = [float(prior_mean[0]), float(prior_mean[1]), float(prior_mean[2])] + + return args + +def ReadPronStats(pron_stats_file_handle): + stats = {} + for line in pron_stats_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in stats file.') + count = float(splits[0]) + word = splits[1] + phones = ' '.join(splits[2:]) + stats[(word, phones)] = count + return stats + +def ReadWordCounts(word_counts_file_handle): + counts = {} + for line in word_counts_file_handle.readlines(): + splits = line.strip().split() + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in counts file.') + word = splits[0] + count = int(splits[1]) + counts[word] = count + return counts + +def ReadLexicon(args, lexicon_file_handle, counts): + # we're skipping any word not in counts (not seen in training data), + # cause we're only learning prons for words who have acoustic examples. 
+ lexicon = defaultdict(set) + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + if word not in counts: + continue + phones = ' '.join(splits[1:]) + lexicon[word].add(phones) + return lexicon + +def FilterPhoneticDecodingLexicon(args, phonetic_decoding_lexicon, stats): + # We want to remove all candidates which contains silence phones + silphones = set() + for line in args.silence_file_handle: + silphones.add(line.strip()) + rejected_candidates = set() + for word, prons in phonetic_decoding_lexicon.iteritems(): + for pron in prons: + for phone in pron.split(): + if phone in silphones: + if (word, pron) in stats: + count = stats[(word, pron)] + del stats[(word, pron)] + else: + count = 0 + rejected_candidates.add((word, pron)) + print('WARNING: removing the candidate pronunciation from phonetic-decoding: {0}: ' + '"{1}" whose soft-count from lattice-alignment is {2}, cause it contains at' + ' least one silence phone.'.format(word, pron, count), file=sys.stderr) + break + for word, pron in rejected_candidates: + phonetic_decoding_lexicon[word].remove(pron) + return phonetic_decoding_lexicon, stats + +def ComputePriorCounts(args, counts, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon): + prior_counts = defaultdict(list) + # In case one source is absent for a word, we set zero prior to this source, + # and then re-normalize the prior mean parameters s.t. they sum up to one. + for word in counts: + prior_mean = [args.prior_mean[0], args.prior_mean[1], args.prior_mean[2]] + if word not in ref_lexicon: + prior_mean[0] = 0 + if word not in g2p_lexicon: + prior_mean[1] = 0 + if word not in phonetic_decoding_lexicon: + prior_mean[2] = 0 + prior_mean_sum = sum(prior_mean) + try: + prior_mean = [t / prior_mean_sum for t in prior_mean] + except ZeroDivisionError: + print('WARNING: word {} appears in train_counts but not in any lexicon.'.format(word), file=sys.stderr) + prior_counts[word] = [t * args.prior_counts_tot for t in prior_mean] + return prior_counts + +def ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon, prior_counts): + posteriors = defaultdict(list) # This dict stores a list of (pronunciation, posterior) + # pairs for each word, where the posteriors are normalized soft counts. Before normalization, + # The soft-counts were augmented by a user-specified prior count, according the source + # (ref/G2P/phonetic-decoding) of this pronunciation. 
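+ # (Illustrative example: with the recommended prior-mean '0.6,0.2,0.2' and the default prior-counts-tot 15.0, + # prior_counts[word] = [9.0, 3.0, 3.0]; a word with two ref prons then gets 9.0/2 = 4.5 prior counts added to + # each ref pron's observed soft-count before normalization.)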
+ + for word, prons in ref_lexicon.iteritems(): + for pron in prons: + # c is the augmented soft count (observed count + prior count) + c = prior_counts[word][0] / len(ref_lexicon[word]) + stats.get((word, pron), 0) + posteriors[word].append((pron, c)) + + for word, prons in g2p_lexicon.iteritems(): + for pron in prons: + c = prior_counts[word][1] / len(g2p_lexicon[word]) + stats.get((word, pron), 0) + posteriors[word].append((pron, c)) + + for word, prons in phonetic_decoding_lexicon.iteritems(): + for pron in prons: + c = prior_counts[word][2] / len(phonetic_decoding_lexicon[word]) + stats.get((word, pron), 0) + posteriors[word].append((pron, c)) + + num_prons_from_ref = sum(len(ref_lexicon[i]) for i in ref_lexicon) + num_prons_from_g2p = sum(len(g2p_lexicon[i]) for i in g2p_lexicon) + num_prons_from_phonetic_decoding = sum(len(phonetic_decoding_lexicon[i]) for i in phonetic_decoding_lexicon) + print ("---------------------------------------------------------------------------------------------------", file=sys.stderr) + print ('Total num. words is {}:'.format(len(posteriors)), file=sys.stderr) + print ('{0} candidate prons came from the reference lexicon; {1} came from G2P;{2} came from' + 'phonetic_decoding'.format(num_prons_from_ref, num_prons_from_g2p, num_prons_from_phonetic_decoding), file=sys.stderr) + print ("---------------------------------------------------------------------------------------------------", file=sys.stderr) + + # Normalize the augmented soft counts to get posteriors. + count_sum = defaultdict(float) # This dict stores the pronunciation which has + # the sum of augmented soft counts for each word. + + for word in posteriors: + # each entry is a pair: (prounciation, count) + count_sum[word] = sum([entry[1] for entry in posteriors[word]]) + + for word, entry in posteriors.iteritems(): + new_entry = [] + for pron, count in entry: + post = count / count_sum[word] + new_entry.append((pron, post)) + source = 'R' + if word in g2p_lexicon and pron in g2p_lexicon[word]: + source = 'G' + elif word in phonetic_decoding_lexicon and pron in phonetic_decoding_lexicon[word]: + source = 'P' + print(word, source, "%3.2f" % post, pron, file=args.pron_posteriors_handle) + del entry[:] + entry.extend(sorted(new_entry, key=lambda new_entry: new_entry[1])) + return posteriors + +def SelectPronsBayesian(args, counts, posteriors, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon): + reference_selected = 0 + g2p_selected = 0 + phonetic_decoding_selected = 0 + learned_lexicon = defaultdict(set) + + for word, entry in posteriors.iteritems(): + num_variants = 0 + post_tot = 0.0 + variants_counts = args.variants_counts + variants_prob_mass = args.variants_prob_mass + if word in ref_lexicon: + # the variants count of the current word's prons in the ref lexicon. + variants_counts_ref = len(ref_lexicon[word]) + # For words who don't appear in acoustic training data at all, we simply accept all ref prons. + # For words in ref. vocab, we set the max num. variants + if counts.get(word, 0) > 0: + variants_counts = math.ceil(1.5 * variants_counts_ref) + else: + variants_counts = variants_counts_ref + variants_prob_mass = 1.0 + last_post = 0.0 + while ((num_variants < variants_counts and post_tot < variants_prob_mass) + or (len(entry) > 0 and entry[-1][1] == last_post)): # this conditions + # means the posterior of the current pron is the same as the one we just included. 
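+ # (Note: each 'entry' list was sorted by ascending posterior in ComputePosteriors, so entry.pop() + # always returns the candidate with the highest remaining posterior.)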
+ try: + pron, post = entry.pop() + last_post = post + except IndexError: + break + post_tot += post + learned_lexicon[word].add(pron) + num_variants += 1 + if word in ref_lexicon and pron in ref_lexicon[word]: + reference_selected += 1 + elif word in g2p_lexicon and pron in g2p_lexicon[word]: + g2p_selected += 1 + else: + phonetic_decoding_selected += 1 + + while (num_variants < variants_counts and post_tot < args.variants_prob_mass_ref): + try: + pron, post = entry.pop() + except IndexError: + break + if word in ref_lexicon and pron in ref_lexicon[word]: + post_tot += post + learned_lexicon[word].add(pron) + num_variants += 1 + reference_selected += 1 + + num_prons_tot = reference_selected + g2p_selected + phonetic_decoding_selected + print('---------------------------------------------------------------------------------------------------', file=sys.stderr) + print ('Num. words in the learned lexicon: {0} num. selected prons: {1}'.format(len(learned_lexicon), num_prons_tot), file=sys.stderr) + print ('{0} selected prons came from reference candidate prons; {1} came from G2P candidate prons;' + '{2} came from phonetic-decoding candidate prons.'.format(reference_selected, g2p_selected, phonetic_decoding_selected), file=sys.stderr) + return learned_lexicon + +def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, phonetic_decoding_lexicon, g2p_lexicon, counts, stats): + # Note that learned_lexicon and ref_lexicon are dicts of sets of prons, while the other two lexicons are sets of (word, pron) pairs. + threshold = 3 + words = [defaultdict(set) for i in range(4)] # "words" contains four bins, where we + # classify each word into, according to whether it's count > threshold, + # and whether it's OOVs w.r.t the reference lexicon. + + src = {} + print("# Note: This file contains pronunciation info for words who have candidate" + "prons from G2P/phonetic-decoding accepted in the learned lexicon." + ", sorted by their counts in acoustic training data, " + ,file=args.ref_lexicon_edits_handle) + print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference)." + ,file=args.ref_lexicon_edits_handle) + print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=args.ref_lexicon_edits_handle) + print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts)." + ,file=args.ref_lexicon_edits_handle) + print("# 4th Col: the pronunciation cadidate.", file=args.ref_lexicon_edits_handle) + + # words which are to be printed into the edits file. + words_to_edit = [] + for word in learned_lexicon: + count = counts.get(word, 0) + flags = ['0' for i in range(3)] # "flags" contains three binary indicators, + # indicating where this word's pronunciations come from. 
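+ # (flags[0]: pron came from phonetic decoding, flags[1]: from the reference lexicon, flags[2]: from G2P, + # matching the assignments below.)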
+ for pron in learned_lexicon[word]: + if word in phonetic_decoding_lexicon and pron in phonetic_decoding_lexicon[word]: + flags[0] = '1' + src[(word, pron)] = 'P' + if word in ref_lexicon and pron in ref_lexicon[word]: + flags[1] = '1' + src[(word, pron)] = 'R' + if word in g2p_lexicon and pron in g2p_lexicon[word]: + flags[2] = '1' + src[(word, pron)] = 'G' + if word in ref_lexicon: + all_ref_prons_accepted = True + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + all_ref_prons_accepted = False + break + if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1': + words_to_edit.append((word, counts[word])) + if count > threshold: + words[0][flags[0] + flags[1] + flags[2]].add(word) + else: + words[1][flags[0] + flags[1] + flags[2]].add(word) + else: + if count > threshold: + words[2][flags[0] + flags[2]].add(word) + else: + words[3][flags[0] + flags[2]].add(word) + + words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True) + for word, count in words_to_edit_sorted: + print("------------",word, "%2.1f" % count, "--------------", file=args.ref_lexicon_edits_handle) + for pron in learned_lexicon[word]: + print(src[(word, pron)], ' | Y | ', "%2.1f | " % stats.get((word, pron), 0), pron, + file=args.ref_lexicon_edits_handle) + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + soft_count = stats.get((word, pron), 0) + print('R | N | {:.2f} | {} '.format(soft_count, pron), file=args.ref_lexicon_edits_handle) + print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr) + print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr) + print("In the learned lexicon, out of those", len(ref_lexicon), "words from the vocab of the reference lexicon:", file=sys.stderr) + print(" For those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr) + num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011']) + num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100']) + num_freq_ivs_from_ref = len(words[0]['010']) + num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011']) + num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100']) + num_infreq_ivs_from_ref = len(words[1]['010']) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr) + 
print("---------------------------------------------------------------------------------------------------", file=sys.stderr) + num_oovs = len(learned_lexicon) - len(ref_lexicon) + num_freq_oovs_from_both_sources = len(words[2]['11']) + num_freq_oovs_from_phonetic_decoding = len(words[2]['10']) + num_freq_oovs_from_g2p = len(words[2]['01']) + num_infreq_oovs_from_both_sources = len(words[3]['11']) + num_infreq_oovs_from_phonetic_decoding = len(words[3]['10']) + num_infreq_oovs_from_g2p = len(words[3]['01']) + print(' In the learned lexicon, out of those {} OOV words (w.r.t the reference lexicon):'.format(num_oovs), file=sys.stderr) + print(' For those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr) + +def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle): + for word, prons in learned_lexicon.iteritems(): + if word not in ref_lexicon: + for pron in prons: + print('{0} {1}'.format(word, pron), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + + # Read in three lexicon sources, word counts, and pron stats. + counts = ReadWordCounts(args.word_counts_file_handle) + ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts) + g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts) + phonetic_decoding_lexicon = ReadLexicon(args, args.phonetic_decoding_lexicon_handle, counts) + stats = ReadPronStats(args.pron_stats_file_handle) + phonetic_decoding_lexicon, stats = FilterPhoneticDecodingLexicon(args, phonetic_decoding_lexicon, stats) + + # Compute prior counts + prior_counts = ComputePriorCounts(args, counts, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon) + # Compute posteriors, and then select prons to construct the learned lexicon. + posteriors = ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon, prior_counts) + + # Select prons to construct the learned lexicon. + learned_lexicon = SelectPronsBayesian(args, counts, posteriors, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, args.learned_lexicon_oov_handle) + # Edits will be printed into ref_lexicon_edits, and the summary will be printed into stderr. 
+ WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, phonetic_decoding_lexicon, g2p_lexicon, counts, stats) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/train_g2p.sh b/egs/wsj/s5/steps/dict/train_g2p.sh new file mode 100755 index 00000000000..85e1605afba --- /dev/null +++ b/egs/wsj/s5/steps/dict/train_g2p.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Xiaohui Zhang +# Apache 2.0 + +# Begin configuration section. +iters=5 +stage=0 +encoding='utf-8' +only_words=true +cmd=run.pl +# a list of silence phones, like data/local/dict/silence_phones.txt +silence_phones= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line) and is directory where the models will " + echo " be stored" + echo "e.g.: train_g2p.sh data/local/lexicon.txt exp/g2p/" + echo "" + echo "main options (for others, see top of script file)" + echo " --iters # How many iterations. Relates to N-ngram order" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +lexicon=$1 +wdir=$2 + + +mkdir -p $wdir/log + +[ ! -f $lexicon ] && echo "$0: Training lexicon does not exist." && exit 1 + +# Optionally remove words that are mapped to a single silence phone from the lexicon. +if $only_words && [ -z $silence_phones ]; then + awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i;a[$1]=s;if(!(s in a)) print $1" "s}' \ + $silence_phones > $wdir/lexicon_onlywords.txt + lexicon=$wdir/lexicon_onlywords.txt +fi + +if ! g2p=`which g2p.py` ; then + echo "Sequitur was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh" + exit 1 +fi + +echo "Training the G2P model (iter 0)" + +if [ $stage -le 0 ]; then + $cmd $wdir/log/g2p.0.log \ + g2p.py -S --encoding $encoding --train $lexicon --devel 5% --write-model $wdir/g2p.model.0 +fi + +for i in `seq 0 $(($iters-2))`; do + + echo "Training the G2P model (iter $[$i + 1] )" + + if [ $stage -le $i ]; then + $cmd $wdir/log/g2p.$(($i + 1)).log \ + g2p.py -S --encoding $encoding --model $wdir/g2p.model.$i --ramp-up --train $lexicon --devel 5% --write-model $wdir/g2p.model.$(($i+1)) + fi + +done + +! (set -e; cd $wdir; ln -sf g2p.model.$[$iters-1] g2p.model.final ) && echo "Problem finalizing training... " && exit 1 + +if [ $stage -le $(($i + 2)) ]; then + echo "Running test..." + $cmd $wdir/log/test.log \ + g2p.py --encoding $encoding --model $wdir/g2p.model.final --test $lexicon +fi + diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py new file mode 100644 index 00000000000..2a472386568 --- /dev/null +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -0,0 +1,9 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +""" This package contains modules and subpackages used in kaldi scripts. +""" + +__all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/__init__.py b/egs/wsj/s5/steps/libs/nnet3/__init__.py new file mode 100644 index 00000000000..03131a3a8d6 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/__init__.py @@ -0,0 +1,12 @@ + +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vimal Manohar +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. 
+ + +# This module has the python functions which facilitate the use of nnet3 toolkit +# It has two sub-modules +# xconfig : Library for parsing high level description of neural networks +# train : Library for training scripts diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py new file mode 100644 index 00000000000..6c824b1195b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + +"""This library has classes and methods to form neural network computation graphs, +in the nnet3 framework, using higher level abstractions called 'layers' +(e.g. sub-graphs like LSTMS ). + +Note : We use the term 'layer' though the computation graph can have a highly +non-linear structure as, other terms such as nodes/components have already been +used in C++ codebase of nnet3. + +This is basically a config parser module, where the configs have very concise +descriptions of a neural network. + +This module has methods to convert the xconfigs into a configs interpretable by +nnet3 C++ library. + +It generates three different configs: + 'init.config' : which is the config with the info necessary for computing + the preconditioning matrix i.e., LDA transform + e.g. + input-node name=input dim=40 + input-node name=ivector dim=100 + output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear + + 'ref.config' : which is a version of the config file used to generate + a model for getting left and right context (it doesn't read + anything for the LDA-like transform and/or + presoftmax-prior-scale components) + + 'final.config' : which has the actual config used to initialize the model used + in training i.e, it has file paths for LDA transform and + other initialization files +""" + + +__all__ = ["utils", "layers", "parser"] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py new file mode 100644 index 00000000000..52f366b4cc2 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -0,0 +1,906 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the parent class from which all layers are inherited +and some basic layer definitions. +""" + +from __future__ import print_function +import sys +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +class XconfigLayerBase(object): + """ A base-class for classes representing layers of xconfig files. + """ + + def __init__(self, first_token, key_to_value, all_layers): + """ + first_token: first token on the xconfig line, e.g. 'affine-layer'.f + key_to_value: dictionary with parameter values + { 'name':'affine1', + 'input':'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))', + 'dim=1024' }. + The only required and 'special' values that are dealt with directly + at this level, are 'name' and 'input'. The rest are put in + self.config and are dealt with by the child classes' init functions. + all_layers: An array of objects inheriting XconfigLayerBase for all + previously parsed layers. 
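+ (These arguments would typically come from parsing an xconfig line such as + 'affine-layer name=affine1 input=Append(0, 1, 2, ReplaceIndex(ivector, t, 0)) dim=1024'.)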
+ """ + + self.layer_type = first_token + if not 'name' in key_to_value: + raise xparser_error("Expected 'name' to be specified.", self.str()) + self.name = key_to_value['name'] + if not xutils.is_valid_line_name(self.name): + raise xparser_error("Invalid value: name={0}".format( + key_to_value['name']), self.str()) + + # the following, which should be overridden in the child class, sets + # default config parameters in self.config. + self.set_default_configs() + # The following is not to be reimplemented in child classes; + # it sets the config values to those specified by the user, and + # parses any Descriptors. + self.set_configs(key_to_value, all_layers) + # This method, sets the derived default config values + # i.e., config values when not specified can be derived from + # other values. It can be overridden in the child class. + self.set_derived_configs() + # the following, which should be overridden in the child class, checks + # that the config parameters that have been set are reasonable. + self.check_configs() + + + def set_configs(self, key_to_value, all_layers): + """ Sets the config variables. + We broke this code out of __init__ for clarity. + the child-class constructor will deal with the configuration values + in a more specific way. + """ + + for key,value in key_to_value.items(): + if key != 'name': + if not key in self.config: + raise xparser_error("Configuration value {0}={1} was not" + " expected in layer of type {2}" + "".format(key, value, self.layer_type), + self.str()) + self.config[key] = xutils.convert_value_to_type(key, + type(self.config[key]), + value) + self.descriptors = dict() + self.descriptor_dims = dict() + # Parse Descriptors and get their dims and their 'final' string form. + # in self.descriptors[key] + for key in self.get_input_descriptor_names(): + if not key in self.config: + raise xparser_error("{0}: object of type {1} needs to override" + " get_input_descriptor_names()." + "".format(sys.argv[0], str(type(self))), + self.str()) + descriptor_string = self.config[key] # input string. + assert isinstance(descriptor_string, str) + desc = self.convert_to_descriptor(descriptor_string, all_layers) + desc_dim = self.get_dim_for_descriptor(desc, all_layers) + desc_norm_str = desc.str() + + # desc_output_str contains the "final" component names, those that + # appear in the actual config file (i.e. not names like + # 'layer.auxiliary_output'); that's how it differs from desc_norm_str. + # Note: it's possible that the two strings might be the same in + # many, even most, cases-- it depends whether + # output_name(self, auxiliary_output) + # returns self.get_name() + '.' + auxiliary_output + # when auxiliary_output is not None. + # That's up to the designer of the layer type. + desc_output_str = self.get_string_for_descriptor(desc, all_layers) + self.descriptors[key] = {'string':desc, + 'normalized-string':desc_norm_str, + 'final-string':desc_output_str, + 'dim':desc_dim} + + # the following helps to check the code by parsing it again. + desc2 = self.convert_to_descriptor(desc_norm_str, all_layers) + desc_norm_str2 = desc2.str() + # if the following ever fails we'll have to do some debugging. 
+ if desc_norm_str != desc_norm_str2: + raise xparser_error("Likely code error: '{0}' != '{1}'" + "".format(desc_norm_str, desc_norm_str2), + self.str()) + + def str(self): + """Converts 'this' to a string which could be printed to + an xconfig file; in xconfig_to_configs.py we actually expand all the + lines to strings and write it as xconfig.expanded as a reference + (so users can see any defaults). + """ + + ans = '{0} name={1}'.format(self.layer_type, self.name) + ans += ' ' + ' '.join([ '{0}={1}'.format(key, self.config[key]) + for key in sorted(self.config.keys())]) + return ans + + def __str__(self): + + return self.str() + + + def normalize_descriptors(self): + """Converts any config variables in self.config which correspond to + Descriptors, into a 'normalized form' derived from parsing them as + Descriptors, replacing things like [-1] with the actual layer names, + and regenerating them as strings. We stored this when the object was + initialized, in self.descriptors; this function just copies them back + to the config. + """ + + for key, desc_str_dict in self.descriptors.items(): + self.config[key] = desc_str_dict['normalized-string'] + + def convert_to_descriptor(self, descriptor_string, all_layers): + """Convenience function intended to be called from child classes, + converts a string representing a descriptor ('descriptor_string') + into an object of type Descriptor, and returns it. It needs 'self' and + 'all_layers' (where 'all_layers' is a list of objects of type + XconfigLayerBase) so that it can work out a list of the names of other + layers, and get dimensions from them. + """ + + prev_names = xutils.get_prev_names(all_layers, self) + tokens = xutils.tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (descriptor, pos) = xutils.parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise xparser_error("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1]), self.str()) + return descriptor + + def get_dim_for_descriptor(self, descriptor, all_layers): + """Returns the dimension of a Descriptor object. This is a convenience + function used in set_configs. + """ + + layer_to_dim_func = \ + lambda name: xutils.get_dim_from_layer_name(all_layers, self, + name) + return descriptor.dim(layer_to_dim_func) + + def get_string_for_descriptor(self, descriptor, all_layers): + """Returns the 'final' string form of a Descriptor object, + as could be used in config files. This is a convenience function + provided for use in child classes; + """ + + layer_to_string_func = \ + lambda name: xutils.get_string_from_layer_name(all_layers, + self, name) + return descriptor.config_string(layer_to_string_func) + + def get_name(self): + """Returns the name of this layer, e.g. 'affine1'. It does not + necessarily correspond to a component name. + """ + + return self.name + + ###### Functions that might be overridden by the child class: ##### + + def set_default_configs(self): + """Child classes should override this. + """ + + raise Exception("Child classes must override set_default_configs().") + + def set_derived_configs(self): + """This is expected to be called after set_configs and before + check_configs(). + """ + + if self.config['dim'] <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + + def check_configs(self): + """child classes should override this. 
+ """ + + pass + + def get_input_descriptor_names(self): + """This function, which may be (but usually will not have to be) + overridden by child classes, returns a list of names of the input + descriptors expected by this component. Typically this would just + return ['input'] as most layers just have one 'input'. However some + layers might require more inputs (e.g. cell state of previous LSTM layer + in Highway LSTMs). It is used in the function 'normalize_descriptors()'. + This implementation will work for layer types whose only + Descriptor-valued config is 'input'. + If a child class adds more inputs, or does not have an input + (e.g. the XconfigInputLayer), it should override this function's + implementation to something like: `return ['input', 'input2']` + """ + + return [ 'input' ] + + def auxiliary_outputs(self): + """Returns a list of all auxiliary outputs that this layer supports. + These are either 'None' for the regular output, or a string + (e.g. 'projection' or 'memory_cell') for any auxiliary outputs that + the layer might provide. Most layer types will not need to override + this. + """ + + return [ None ] + + def output_name(self, auxiliary_output = None): + """Called with auxiliary_output == None, this returns the component-node + name of the principal output of the layer (or if you prefer, the text + form of a descriptor that gives you such an output; such as + Append(some_node, some_other_node)). + The 'auxiliary_output' argument is a text value that is designed for + extensions to layers that have additional auxiliary outputs. + For example, to implement a highway LSTM you need the memory-cell of a + layer, so you might allow auxiliary_output='memory_cell' for such a + layer type, and it would return the component node or a suitable + Descriptor: something like 'lstm3.c_t' + """ + + raise Exception("Child classes must override output_name()") + + def output_dim(self, auxiliary_output = None): + """The dimension that this layer outputs. The 'auxiliary_output' + parameter is for layer types which support auxiliary outputs. + """ + + raise Exception("Child classes must override output_dim()") + + def get_full_config(self): + """This function returns lines destined for the 'full' config format, as + would be read by the C++ programs. Since the program + xconfig_to_configs.py writes several config files, this function returns + a list of pairs of the form (config_file_basename, line), + e.g. something like + [ ('init', 'input-node name=input dim=40'), + ('ref', 'input-node name=input dim=40') ] + which would be written to config_dir/init.config and config_dir/ref.config. + """ + + raise Exception("Child classes must override get_full_config()") + + +class XconfigInputLayer(XconfigLayerBase): + """This class is for lines like + 'input name=input dim=40' + or + 'input name=ivector dim=100' + in the config file. + """ + + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'input' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + self.config = { 'dim': -1} + + def check_configs(self): + + if self.config['dim'] <= 0: + raise xparser_error("Dimension of input-layer '{0}'" + "should be positive.".format(self.name), + self.str()) + + def get_input_descriptor_names(self): + + return [] # there is no 'input' field in self.config. 
+ + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.config['dim'] + + def get_full_config(self): + + # unlike other layers the input layers need to be printed in + # 'init.config' (which initializes the neural network prior to the LDA) + ans = [] + for config_name in [ 'init', 'ref', 'final' ]: + ans.append( (config_name, + 'input-node name={0} dim={1}'.format(self.name, + self.config['dim']))) + return ans + + + +class XconfigTrivialOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + This is for outputs that are not really output "layers" + (there is no affine transform or nonlinearity), they just directly map to an + output-node in nnet3. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]' } + + def check_configs(self): + + pass # nothing to check; descriptor-parsing can't happen in this function. + + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the output + # of the previous layer + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + assert auxiliary_outputs is None + # note: each value of self.descriptors is (descriptor, dim, normalized-string, output-string). + return self.descriptors['input']['dim'] + + def get_full_config(self): + + # the input layers need to be printed in 'init.config' (which + # initializes the neural network prior to the LDA), in 'ref.config', + # which is a version of the config file used for getting left and right + # context (it doesn't read anything for the LDA-like transform and/or + # presoftmax-prior-scale components) + # In 'full.config' we write everything, this is just for reference, + # and also for cases where we don't use the LDA-like transform. + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'output-string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_str = self.descriptors['input']['final-string'] + + for config_name in ['init', 'ref', 'final' ]: + ans.append( (config_name, + 'output-node name={0} input={1}'.format( + self.name, descriptor_final_str))) + return ans + + +class XconfigOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output-layer name=output dim=4257 input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + By default this includes a log-softmax component. The parameters are + initialized to zero, asthis is best for output layers. + + Parameters of the class, and their defaults: + input='[-1]' : Descriptor giving the input of the layer. + dim=None : Output dimension of layer, will normally equal the number of pdfs. + include-log-softmax=true : setting it to false will omit the + log-softmax component- useful for chain models. 
+ objective-type=linear : the only other choice currently is + 'quadratic', for use in regression problems + learning-rate-factor=1.0 : Learning rate factor for the final + affine component, multiplies the standard learning rate. normally + you'll leave this as-is, but for xent regularization output layers + for chain models you'll want to set + learning-rate-factor=(0.5/xent_regularize), + normally learning-rate-factor=5.0 since xent_regularize is + normally 0.1. + presoftmax-scale-file=None : If set, a filename for a vector that + will be used to scale the output of the affine component before the + log-softmax (if include-log-softmax=true), or before the output + (if not). This is helpful to avoid instability in training due to + some classes having much more data than others. The way we normally + create this vector is to take the priors of the classes to the + power -0.25 and rescale them so the average is 1.0. This factor + -0.25 is referred to as presoftmax_prior_scale_power in scripts. In + the scripts this would normally be set to + config_dir/presoftmax_prior_scale.vec + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = {'input' : '[-1]', + 'dim' : -1, + 'include-log-softmax' : True, + # this would be false for chain models + 'objective-type' : 'linear', + # see Nnet::ProcessOutputNodeConfigLine in + # nnet-nnet.cc for other options + 'learning-rate-factor' : 1.0, + 'presoftmax-scale-file' : '', + # used in DNN (not RNN) training when using + # frame-level objfns, + 'max-change' : 1.5, + 'param-stddev' : 0.0, + 'bias-stddev' : 0.0, + 'output-delay' : 0 + } + + def check_configs(self): + + if self.config['dim'] <= -1: + raise xparser_error("In output-layer, dim has invalid value {0}" + "".format(self.config['dim']), self.str()) + + if self.config['objective-type'] != 'linear' and \ + self.config['objective_type'] != 'quadratic': + raise xparser_error("In output-layer, objective-type has" + " invalid value {0}" + "".format(self.config['objective-type']), + self.str()) + + if self.config['learning-rate-factor'] <= 0.0: + raise xparser_error("In output-layer, learning-rate-factor has" + " invalid value {0}" + "".format(self.config['learning-rate-factor']), + self.str()) + + + # you cannot access the output of this layer from other layers... see + # comment in output_name for the reason why. + def auxiliary_outputs(self): + + return [] + + def output_name(self, auxiliary_outputs = None): + + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. But it might make sense to make the output of the softmax + # layer and/or the output of the affine layer available as inputs to + # other layers, in some circumstances. + # we'll implement that when it's needed. + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def output_dim(self, auxiliary_output = None): + + # see comment in output_name(). + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def get_full_config(self): + + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). 
+ # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + objective_type = self.config['objective-type'] + learning_rate_factor = self.config['learning-rate-factor'] + include_log_softmax = self.config['include-log-softmax'] + presoftmax_scale_file = self.config['presoftmax-scale-file'] + param_stddev = self.config['param-stddev'] + bias_stddev = self.config['bias-stddev'] + output_delay = self.config['output-delay'] + max_change = self.config['max-change'] + + # note: ref.config is used only for getting the left-context and + # right-context of the network; + # final.config is where we put the actual network definition. + for config_name in [ 'ref', 'final' ]: + # First the affine node. + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} ' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change) + + ('learning-rate-factor={0} '.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '')) + ans.append((config_name, line)) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + if presoftmax_scale_file is not '' and config_name == 'final': + # don't use the presoftmax-scale in 'ref.config' since that + # file won't exist at the time we evaluate it. + # (ref.config is used to find the left/right context). + line = ('component name={0}.fixed-scale' + ' type=FixedScaleComponent scales={1}' + ''.format(self.name, presoftmax_scale_file)) + ans.append((config_name, line)) + + line = ('component-node name={0}.fixed-scale' + ' component={0}.fixed-scale input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.fixed-scale'.format(self.name) + + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + ans.append((config_name, line)) + + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.log-softmax'.format(self.name) + + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + + line = ('output-node name={0} input={1}'.format(self.name, cur_node)) + ans.append((config_name, line)) + return ans + + +# This class is for parsing lines like +# 'relu-renorm-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# or: +# 'sigmoid-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# which specify addition of an affine component and a sequence of non-linearities. +# Here, the name of the layer itself dictates the sequence of nonlinearities +# that are applied after the affine component; the name should contain some +# combination of 'relu', 'renorm', 'sigmoid' and 'tanh', +# and these nonlinearities will be added along with the affine component. +# +# The dimension specified is the output dim; the input dim is worked out from the input descriptor. 
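+# (Illustrative example: 'relu-renorm-layer name=tdnn1 dim=512' expands to a NaturalGradientAffineComponent +# followed by RectifiedLinearComponent and NormalizeComponent nodes, and its output name is 'tdnn1.renorm'.)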
+# This class supports only nonlinearity types that do not change the dimension; we can create +# another layer type to enable the use p-norm and similar dimension-reducing nonlinearities. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer, e.g. 1024] +# self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] +# +class XconfigBasicLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + # Here we just list some likely combinations.. you can just add any + # combinations you want to use, to this list. + assert first_token in [ 'relu-layer', 'relu-renorm-layer', 'sigmoid-layer', + 'tanh-layer' ] + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim':-1, + 'max-change' : 0.75, + 'self-repair-scale' : 1.0e-05, + 'target-rms' : 1.0, + 'ng-affine-options' : ''} + + def check_configs(self): + if self.config['dim'] < 0: + raise xparser_error("dim has invalid value {0}".format(self.config['dim']), self.str()) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: + raise xparser_error("self-repair-scale has invalid value {0}".format(self.config['self-repair-scale']), self.str()) + if self.config['target-rms'] < 0.0: + raise xparser_error("target-rms has invalid value {0}".format(self.config['target-rms']), self.str()) + + def output_name(self, auxiliary_output=None): + # at a later stage we might want to expose even the pre-nonlinearity + # vectors + assert auxiliary_output == None + + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + last_nonlinearity = split_layer_name[-2] + # return something like: layer3.renorm + return '{0}.{1}'.format(self.name, last_nonlinearity) + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + nonlinearities = split_layer_name[:-1] + + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + + # the child classes e.g. tdnn might want to process the input + # before adding the other components + + return self._add_components(input_desc, input_dim, nonlinearities) + + def _add_components(self, input_desc, input_dim, nonlinearities): + output_dim = self.output_dim() + self_repair_scale = self.config['self-repair-scale'] + target_rms = self.config['target-rms'] + max_change = self.config['max-change'] + ng_opt_str = self.config['ng-affine-options'] + + configs = [] + # First the affine node. 
+ line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' max-change={3}' + ' {4}' + ''.format(self.name, input_dim, output_dim, + max_change, ng_opt_str)) + configs.append(line) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, input_desc)) + configs.append(line) + cur_node = '{0}.affine'.format(self.name) + + for nonlinearity in nonlinearities: + if nonlinearity == 'relu': + line = ('component name={0}.{1}' + ' type=RectifiedLinearComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'sigmoid': + line = ('component name={0}.{1}' + ' type=SigmoidComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'tanh': + line = ('component name={0}.{1}' + ' type=TanhComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'renorm': + line = ('component name={0}.{1}' + ' type=NormalizeComponent dim={2}' + ' target-rms={3}' + ''.format(self.name, nonlinearity, output_dim, + target_rms)) + + else: + raise xparser_error("Unknown nonlinearity type:" + "{0}".format(nonlinearity), self.str()) + + configs.append(line) + line = ('component-node name={0}.{1}' + ' component={0}.{1} input={2}' + ''.format(self.name, nonlinearity, cur_node)) + + configs.append(line) + cur_node = '{0}.{1}'.format(self.name, nonlinearity) + return configs + + +# This class is for lines like +# 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat' +# +# The output dimension of the layer may be specified via 'dim=xxx', but if not specified, +# the dimension defaults to the same as the input. Note: we don't attempt to read that +# file at the time the config is created, because in the recipes, that file is created +# after the config files. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer; defaults to the same as the input dim.] +# affine-transform-file='' [Must be specified.] +# +class XconfigFixedAffineLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'fixed-affine-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim':-1, + 'affine-transform-file':''} + + def check_configs(self): + if self.config['affine-transform-file'] is None: + raise xparser_error("affine-transform-file must be set.", self.str()) + + def output_name(self, auxiliary_output = None): + # Fixed affine layer computes only one vector, there are no intermediate + # vectors. + assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). 
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        descriptor_final_string = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+        output_dim = self.output_dim()
+        transform_file = self.config['affine-transform-file']
+
+
+        # to init.config we write an output-node with the name 'output' and
+        # with a Descriptor equal to the descriptor that's the input to this
+        # layer.  This will be used to accumulate stats to learn the LDA transform.
+        line = 'output-node name=output input={0}'.format(descriptor_final_string)
+        ans.append(('init', line))
+
+        # write the 'real' component to final.config
+        line = 'component name={0} type=FixedAffineComponent matrix={1}'.format(
+            self.name, transform_file)
+        ans.append(('final', line))
+        # write a random version of the component, with the same dims, to ref.config
+        line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format(
+            self.name, input_dim, output_dim)
+        ans.append(('ref', line))
+        # the component-node gets written to final.config and ref.config.
+        line = 'component-node name={0} component={0} input={1}'.format(
+            self.name, descriptor_final_string)
+        ans.append(('final', line))
+        ans.append(('ref', line))
+        return ans
+
+# This class is for lines like
+#  'affine-layer name=affine input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0))'
+#
+# The output dimension of the layer may be specified via 'dim=xxx', but if not specified,
+# the dimension defaults to the same as the input.
+#
+# See other configuration values below.
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'    [Descriptor giving the input of the layer.]
+#   dim=None        [Output dimension of layer; defaults to the same as the input dim.]
+#
+class XconfigAffineLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == 'affine-layer'
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        # note: self.config['input'] is a descriptor, '[-1]' means output
+        # the most recent layer.
+        # use None for optional parameters as we want to default to the C++ defaults
+        # C++ component provides more options but I will just expose these for now
+        # Note : The type of the parameter is determined based on the value assigned
+        # so please use a decimal point if your parameter is a float
+        self.config = { 'input' : '[-1]',
+                        'dim' : -1,
+                        'param-stddev' : -1.0, # this has to be initialized to 1/sqrt(input_dim)
+                        'bias-stddev' : 1.0,
+                        'bias-mean' : 0.0,
+                        'max-change' : 0.75,
+                        'learning-rate-factor' : 1.0,
+                        'ng-affine-options' : ''}
+
+    def set_derived_configs(self):
+        super(XconfigAffineLayer, self).set_derived_configs()
+        if self.config['param-stddev'] < 0:
+            # default param-stddev is 1/sqrt(input_dim), as noted above.
+            self.config['param-stddev'] = 1.0 / (self.descriptors['input']['dim'] ** 0.5)
+
+    def check_configs(self):
+        if self.config['dim'] <= 0:
+            raise xparser_error("dim has invalid value {0}".format(self.config['dim']), self.str())
+
+    def output_name(self, auxiliary_output = None):
+        # affine layer computes only one vector, there are no intermediate
+        # vectors.
+ assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + + option_string='' + for key in ['param-stddev', 'bias-stddev', 'bias-mean', 'max-change']: + option_string += ' {0}={1}'.format(key, self.config[key]) + option_string += self.config['ng-affine-options'] + + conf_lines = [] + # write the 'real' component to final.config + conf_lines.append('component name={n} type=NaturalGradientAffineComponent ' + 'input-dim={i} output-dim={o} {opts}'.format(n = self.name, + i = input_dim, + o = output_dim, + opts = option_string)) + # the component-node gets written to final.config and ref.config. + conf_lines.append('component-node name={0} component={0} input={1}'.format(self.name, + descriptor_final_string)) + + # the config is same for both final and ref configs + for conf_name in ['final', 'ref']: + for line in conf_lines: + ans.append((conf_name, line)) + return ans + + +def test_layers(): + # for some config lines that should be printed the same way as they + # are read, check that this is the case. + for x in [ 'input name=input dim=30' ]: + assert str(config_line_to_object(x, [])) == x diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py new file mode 100644 index 00000000000..353b9d3bba4 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -0,0 +1,8 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + +from basic_layers import * +from lstm import * +from tdnn import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py new file mode 100644 index 00000000000..7b37958f81b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -0,0 +1,532 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + + +""" This module has the implementations of different LSTM layers. +""" +import re + +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# This class is for lines like +# 'lstm-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph without output projections. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. 
This is the threshold used to decide if clipping has to be activated ]
+# norm-based-clipping=True [specifies if the gradient clipping has to be activated based on total norm or on per-element magnitude]
+# self-repair-scale-nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
+#                                      i.e. SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
+# ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the LSTM ]
+# ng-affine-options=''              [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1]
+class XconfigLstmLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == "lstm-layer"
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input' : '[-1]',
+                       'cell-dim' : -1, # this is a compulsory argument
+                       'clipping-threshold' : 30.0,
+                       'norm-based-clipping' : True,
+                       'delay' : -1,
+                       'ng-per-element-scale-options' : ' max-change=0.75',
+                       'ng-affine-options' : ' max-change=0.75 ',
+                       'self-repair-scale-nonlinearity' : 0.00001,
+                       'zeroing-interval' : 20,
+                       'zeroing-threshold' : 3.0
+                       }
+
+    def set_derived_configs(self):
+        if self.config['cell-dim'] <= 0:
+            self.config['cell-dim'] = self.descriptors['input']['dim']
+
+    def check_configs(self):
+        key = 'cell-dim'
+        if self.config['cell-dim'] <= 0:
+            raise xparser_error("cell-dim has invalid value {0}.".format(self.config[key]), self.str())
+
+        for key in ['self-repair-scale-nonlinearity']:
+            if self.config[key] < 0.0 or self.config[key] > 1.0:
+                raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str())
+
+    def auxiliary_outputs(self):
+        return ['c_t']
+
+    def output_name(self, auxiliary_output = None):
+        node_name = 'm_t'
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                node_name = auxiliary_output
+            else:
+                raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str())
+
+        return '{0}.{1}'.format(self.name, node_name)
+
+    def output_dim(self, auxiliary_output = None):
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str())
+
+        return self.config['cell-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self.generate_lstm_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in LSTM initialization
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    # convenience function to generate the LSTM config
+    def generate_lstm_config(self):
+
+        # assign some variables to reduce verbosity
+        name = self.name
+        # in the below code we will just call descriptor_strings as descriptors for conciseness
+        input_dim = self.descriptors['input']['dim']
+        input_descriptor = self.descriptors['input']['final-string']
+        cell_dim = self.config['cell-dim']
+        delay = self.config['delay']
+
+        repair_nonlin = self.config['self-repair-scale-nonlinearity']
+        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
+        bptrunc_str = ("clipping-threshold={0}"
+                       " zeroing-threshold={1}"
+                       " zeroing-interval={2}"
+                       " recurrence-interval={3}"
+                       "".format(self.config['clipping-threshold'],
+                                 self.config['zeroing-threshold'],
+                                 self.config['zeroing-interval'],
+                                 abs(delay)))
+        affine_str = self.config['ng-affine-options']
+        pes_str = self.config['ng-per-element-scale-options']
+        # Natural gradient per element scale parameters
+        # TODO: decide if we want to keep exposing these options
+        if re.search('param-mean', pes_str) is None and \
+           re.search('param-stddev', pes_str) is None:
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
+
+        configs = []
+
+        # the equations implemented here are
+        # TODO: write these
+        # naming convention
+        # <layer-name>.W_<output-name>.<input-name> e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r]
+        configs.append("# Input gate control : W_i* matrices")
+        configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+        configs.append("# note : the cell outputs pass through a diagonal matrix")
+        configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str))
+
+        configs.append("# Forget gate control : W_f* matrices")
+        configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+        configs.append("# note : the cell outputs pass through a diagonal matrix")
+        configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str))
+
+        configs.append("# Output gate control : W_o* matrices")
+        configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+        configs.append("# note : the cell outputs pass through a diagonal matrix")
+        configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str))
+
+        configs.append("# Cell input matrices : W_c* matrices")
+        configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+
+
+        configs.append("# Defining the non-linearities")
+        configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.g type=TanhComponent dim={1} 
{2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name)) + + return configs + + +# This class is for lines like +# 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigLstmLayer for this +# simple LSTM. 
+# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
+# the dimension defaults to the same as the input.
+# See other configuration values below.
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'             [Descriptor giving the input of the layer.]
+#   cell-dim=-1              [Dimension of the cell]
+#   recurrent-projection-dim=-1     [Dimension of the projection used in recurrent connections; defaults to cell-dim/2]
+#   non-recurrent-projection-dim=-1 [Dimension of the projection in non-recurrent connections; defaults to cell-dim/2]
+#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
+#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ]
+#   norm-based-clipping=True [specifies if the gradient clipping has to be activated based on total norm or on per-element magnitude]
+#   self-repair-scale-nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
+#                                      i.e. SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
+#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the LSTM ]
+#   ng-affine-options=''              [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1]
+class XconfigLstmpLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == "lstmp-layer"
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input' : '[-1]',
+                       'cell-dim' : -1, # this is a compulsory argument
+                       'recurrent-projection-dim' : -1,
+                       'non-recurrent-projection-dim' : -1,
+                       'clipping-threshold' : 30.0,
+                       'norm-based-clipping' : True,
+                       'delay' : -1,
+                       'ng-per-element-scale-options' : ' max-change=0.75 ',
+                       'ng-affine-options' : ' max-change=0.75 ',
+                       'self-repair-scale-nonlinearity' : 0.00001,
+                       'zeroing-interval' : 20,
+                       'zeroing-threshold' : 3.0
+                       }
+
+    def set_derived_configs(self):
+        if self.config['cell-dim'] <= 0:
+            self.config['cell-dim'] = self.descriptors['input']['dim']
+
+        for key in ['recurrent-projection-dim', 'non-recurrent-projection-dim']:
+            if self.config[key] <= 0:
+                self.config[key] = self.config['cell-dim'] / 2
+
+    def check_configs(self):
+        for key in ['cell-dim', 'recurrent-projection-dim', 'non-recurrent-projection-dim']:
+            if self.config[key] <= 0:
+                raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str())
+
+        for key in ['self-repair-scale-nonlinearity']:
+            if self.config[key] < 0.0 or self.config[key] > 1.0:
+                raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str())
+
+    def auxiliary_outputs(self):
+        return ['c_t']
+
+    def output_name(self, auxiliary_output = None):
+        node_name = 'rp_t'
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                node_name = auxiliary_output
+            else:
+                raise Exception("In layer of type {0}: unknown auxiliary output name '{1}'".format(self.layer_type, auxiliary_output))
+
+        return '{0}.{1}'.format(self.name, node_name)
+
+    def output_dim(self, auxiliary_output = None):
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise Exception("In {0} of type {1}, unknown auxiliary output name 
{1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_lstm_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + configs = [] + # the equations implemented here are from Sak et. al. "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" + # http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf + # naming convention + # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Input gate control : W_i* matrices") + configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Forget gate control : W_f* matrices") + configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Output gate control : W_o* matrices") + configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Cell input matrices : W_c* matrices") + configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + recurrent_connection = '{0}.r_t'.format(name) + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node 
name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs + +# Same as the LSTMP layer except that the matrix multiplications are combined +# we probably keep only version after experimentation. 
One year old experiments +# show that this version is slightly worse and might require some tuning +class XconfigLstmpcLayer(XconfigLstmpLayer): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstmpc-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', ng_per_element_scale_options) is None and \ + re.search('param-stddev', ng_per_element_scale_options) is None: + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + pes_str = ng_per_element_scale_options + + configs = [] + # naming convention + # .W_. e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Full W_ifoc* matrix") + configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + + # we will not combine the diagonal matrix operations as one of these has a different delay + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 
* cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + rec_connection = '{0}.rp_t'.format(name) + + component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + + + offset = 0 + component_nodes.append("# i_t") + component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, recurrent_projection_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node 
name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py new file mode 100644 index 00000000000..4976084a977 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -0,0 +1,94 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the top level xconfig parsing functions. +""" + +import libs.nnet3.xconfig.layers as xlayers +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# We have to modify this dictionary when adding new layers +config_to_layer = { + 'input' : xlayers.XconfigInputLayer, + 'output' : xlayers.XconfigTrivialOutputLayer, + 'output-layer' : xlayers.XconfigOutputLayer, + 'relu-layer' : xlayers.XconfigBasicLayer, + 'relu-renorm-layer' : xlayers.XconfigBasicLayer, + 'sigmoid-layer' : xlayers.XconfigBasicLayer, + 'tanh-layer' : xlayers.XconfigBasicLayer, + 'tdnn-relu-layer' : xlayers.XconfigTdnnLayer, + 'tdnn-relu-renorm-layer' : xlayers.XconfigTdnnLayer, + 'tdnn-sigmoid-layer' : xlayers.XconfigTdnnLayer, + 'tdnn-tanh-layer' : xlayers.XconfigTdnnLayer, + 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, + 'affine-layer' : xlayers.XconfigAffineLayer, + 'lstm-layer' : xlayers.XconfigLstmLayer, + 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmpc-layer' : xlayers.XconfigLstmpcLayer + } + +# Converts a line as parsed by ParseConfigLine() into a first +# token e.g. 'input-layer' and a key->value map, into +# an objet inherited from XconfigLayerBase. +# 'prev_names' is a list of previous layer names, it's needed +# to parse things like '[-1]' (meaning: the previous layer) +# when they appear in Desriptors. +def parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names): + + conf_line = first_token + ' ' + ' '.join(['{0}={1}'.format(x,y) for x,y in key_to_value.items()]) + + if not config_to_layer.has_key(first_token): + raise xparser_error("No such layer type.", conf_line) + + try: + return config_to_layer[first_token](first_token, key_to_value, prev_names) + except xparser_error as e: + if e.conf_line is None: + # we want to throw informative errors which point to the xconfig line + e.conf_line = conf_line + raise + +# Uses ParseConfigLine() to turn a config line that has been parsed into +# a first token e.g. 'affine-layer' and a key->value map like { 'dim':'1024', 'name':'affine1' }, +# and then turns this into an object representing that line of the config file. +# 'prev_names' is a list of the names of preceding lines of the +# config file. +def config_line_to_object(config_line, prev_names = None): + (first_token, key_to_value) = xutils.parse_config_line(config_line) + return parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names) + +# This function reads an xconfig file and returns it as a list of layers +# (usually we use the variable name 'all_layers' elsewhere for this). +# It will die if the xconfig file is empty or if there was +# some error parsing it. 
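+# As a rough sketch of the expected input (the layer names and dimensions here
+# are only illustrative), an xconfig file might contain lines like:
+#   input name=input dim=40
+#   relu-renorm-layer name=layer1 dim=512 input=Append(-1,0,1)
+#   output-layer name=output dim=3000
+# for which this function would return a list of three objects of types
+# XconfigInputLayer, XconfigBasicLayer and XconfigOutputLayer respectively.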
+def read_xconfig_file(xconfig_filename): + try: + f = open(xconfig_filename, 'r') + except Exception as e: + sys.exit("{0}: error reading xconfig file '{1}'; error was {2}".format( + sys.argv[0], xconfig_filename, repr(e))) + all_layers = [] + while True: + line = f.readline() + if line == '': + break + x = xutils.parse_config_line(line) + if x is None: + continue # line was blank or only comments. + (first_token, key_to_value) = x + # the next call will raise an easy-to-understand exception if + # it fails. + this_layer = parsed_line_to_xconfig_layer(first_token, + key_to_value, + all_layers) + all_layers.append(this_layer) + if len(all_layers) == 0: + raise xparser_error("{0}: xconfig file '{1}' is empty".format( + sys.argv[0], xconfig_filename)) + f.close() + return all_layers + + diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py new file mode 100644 index 00000000000..21f9db4f5c8 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py @@ -0,0 +1,110 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + + +""" This module contains the implementation of the TDNN layer. +""" + +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error +from libs.nnet3.xconfig.basic_layers import XconfigBasicLayer +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase + +class XconfigTdnnLayer(XconfigBasicLayer): + """This class is for parsing lines like + tdnn-relu-renorm-layer name=tdnn1 dim=1024 splice-indexes=-3,0,3 subset-dim=512 + + It is similar to XconfigBasicLayer except for the way in which the input + splicing is done. So we derive this class from XconfigBasicLayer. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token in [ 'tdnn-relu-layer', 'tdnn-relu-renorm-layer', + 'tdnn-sigmoid-layer', 'tdnn-tanh-layer' ] + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + super(XconfigTdnnLayer, self).set_default_configs() + + self.config['splice-indexes'] = '' + self.config['subset-dim'] = -1 + + def check_configs(self): + + if self.config['splice-indexes'] == '': + raise xparser_error("splice-indexes has to be non-empty", self.str()) + super(XconfigTdnnLayer, self).check_configs() + + + def _generate_config(self): + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + # ignore the first 'tdnn' and the last 'layer' + nonlinearities = split_layer_name[1:-1] + + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + splice_indexes = self.get_splice_indexes() + input_desc, input_dim, sp_configs = self.splice_input(input_desc, + input_dim, splice_indexes, self.config['subset-dim'], + '{0}.input-subset'.format(self.name)) + + return sp_configs + self._add_components(input_desc, input_dim, nonlinearities) + + def get_splice_indexes(self): + try: + return map(lambda x: int(x), self.config['splice-indexes'].split(",")) + except ValueError: + raise xparser_error("Invalid value for splice-indexes.", str(self)) + + @staticmethod + def splice_input(input_desc, input_dim, + splice_indexes, subset_dim = -1, + dim_range_node_name = None ): + """Convenience function to create an appended descriptor with the + splice_indexes. 
+ """ + + configs = [] + try: + zero_index = splice_indexes.index(0) + except ValueError: + zero_index = None + + if subset_dim > 0: + assert(dim_range_node_name is not None) + # if subset_dim is specified the script expects a zero + # in the splice indexes + assert(zero_index is not None) + line = ("dim-range-node name={0}" + " input-node={1}" + " dim-offset={2}" + " dim={3}" + "".format(dim_range_node_name, + input_desc, 0, subset_dim)) + configs.append(line) + subset_desc = dim_range_node_name + + else: + subset_desc = input_desc + subset_dim = input_dim + + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes)): + if j == zero_index: + appended_descriptors.append(input_desc) + appended_dimension += input_dim + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_desc, splice_indexes[j])) + appended_dimension += subset_dim + return ["Append({0})".format(", ".join(appended_descriptors)), + appended_dimension, + configs] + + diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py new file mode 100644 index 00000000000..87c9d880089 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -0,0 +1,615 @@ +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey). +# License: Apache 2.0. + +# This library contains various utilities that are involved in processing +# of xconfig -> config conversion. It contains "generic" lower-level code +# while xconfig_layers.py contains the code specific to layer types. + +from __future__ import print_function +import re +import sys + + +class XconfigParserError(RuntimeError): + def __init__(self, error_msg, conf_line=None): + self.conf_line = conf_line + if conf_line is not None: + self.msg = 'While parsing "{c}" :{e}'.format(c=conf_line, e=error_msg) + else: + self.msg = error_msg + + def __str__(self): + return self.msg + +# [utility function used in xconfig_layers.py] +# Given a list of objects of type XconfigLayerBase ('all_layers'), +# including at least the layers preceding 'current_layer' (and maybe +# more layers), return the names of layers preceding 'current_layer' +# This will be used in parsing expressions like [-1] in descriptors +# (which is an alias for the previous layer). +def get_prev_names(all_layers, current_layer): + prev_names = [] + for layer in all_layers: + if layer is current_layer: + break + prev_names.append(layer.get_name()) + prev_names_set = set() + for name in prev_names: + if name in prev_names_set: + raise XconfigParserError("{0}: Layer name {1} is used more than once.".format( + sys.argv[0], name), current_layer.str()) + prev_names_set.add(name) + return prev_names + + +# This is a convenience function to parser the auxiliary output name from the +# full layer name + +def split_layer_name(full_layer_name): + assert isinstance(full_layer_name, str) + split_name = full_layer_name.split('.') + if len(split_name) == 0: + raise XconfigParserError("Bad layer name: " + full_layer_name) + layer_name = split_name[0] + if len(split_name) == 1: + auxiliary_output = None + else: + # we probably expect len(split_name) == 2 in this case, + # but no harm in allowing dots in the auxiliary_output. + auxiliary_output = '.'.join(split_name[1:]) + + return [layer_name, auxiliary_output] + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a dimension. 
'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output), layer.str()) + return layer.output_dim(auxiliary_output) + # No such layer was found. + if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a descriptor (usually, but not required to be a simple +# component-node name) that can appear in the generated config file. 'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_string_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format( + layer_name, auxiliary_output)) + return layer.output_name(auxiliary_output) + # No such layer was found. + if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# This function, used in converting string values in config lines to +# configuration values in self.config in layers, attempts to +# convert 'string_value' to an instance dest_type (which is of type Type) +# 'key' is only needed for printing errors. 
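+# For example (illustrative calls only):
+#   convert_value_to_type('dim', type(int()), '1024')  returns the integer 1024
+#   convert_value_to_type('include-log-softmax', type(bool()), 'false')  returns False
+# and a value that cannot be converted to the requested type raises
+# XconfigParserError.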
+def convert_value_to_type(key, dest_type, string_value): + if dest_type == type(bool()): + if string_value == "True" or string_value == "true": + return True + elif string_value == "False" or string_value == "false": + return False + else: + raise XconfigParserError("Invalid configuration value {0}={1} (expected bool)".format( + key, string_value)) + elif dest_type == type(int()): + try: + return int(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(float()): + try: + return float(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(str()): + return string_value + + + +# This class parses and stores a Descriptor-- expression +# like Append(Offset(input, -3), input) and so on. +# For the full range of possible expressions, see the comment at the +# top of src/nnet3/nnet-descriptor.h. +# Note: as an extension to the descriptor format used in the C++ +# code, we can have e.g. input@-3 meaning Offset(input, -3); +# and if bare integer numbers appear where a descriptor was expected, +# they are interpreted as Offset(prev_layer, -3) where 'prev_layer' +# is the previous layer in the config file. + +# Also, in any place a raw input/layer/output name can appear, we accept things +# like [-1] meaning the previous input/layer/output's name, or [-2] meaning the +# last-but-one input/layer/output, and so on. +class Descriptor: + def __init__(self, + descriptor_string = None, + prev_names = None): + # self.operator is a string that may be 'Offset', 'Append', + # 'Sum', 'Failover', 'IfDefined', 'Offset', 'Switch', 'Round', + # 'ReplaceIndex'; it also may be None, representing the base-case + # (where it's just a layer name) + + # self.items will be whatever items are + # inside the parentheses, e.g. if this is Sum(foo bar), + # then items will be [d1, d2], where d1 is a Descriptor for + # 'foo' and d1 is a Descriptor for 'bar'. However, there are + # cases where elements of self.items are strings or integers, + # for instance in an expression 'ReplaceIndex(ivector, x, 0)', + # self.items would be [d, 'x', 0], where d is a Descriptor + # for 'ivector'. In the case where self.operator is None (where + # this Descriptor represents just a bare layer name), self. + # items contains the name of the input layer as a string. + self.operator = None + self.items = None + + if descriptor_string != None: + try: + tokens = tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (d, pos) = parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise XconfigParserError("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1])) + # copy members from d. + self.operator = d.operator + self.items = d.items + except XconfigParserError as e: + traceback.print_tb(sys.exc_info()[2]) + raise XconfigParserError("Error parsing Descriptor '{0}', specific error was: {1}".format( + descriptor_string, repr(e))) + + # This is like the str() function, but it uses the layer_to_string function + # (which is a function from strings to strings) to convert layer names (or + # in general sub-layer names of the form 'foo.bar') to the component-node + # (or, in general, descriptor) names that appear in the final config file. 
+ # This mechanism gives those designing layer types the freedom to name their + # nodes as they want. + def config_string(self, layer_to_string): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return layer_to_string(self.items[0]) + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join( + [ item.config_string(layer_to_string) if isinstance(item, Descriptor) else str(item) + for item in self.items]) + ')' + + def str(self): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return self.items[0] + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join([str(item) for item in self.items]) + ')' + + def __str__(self): + return self.str() + + # This function returns the dimension (i.e. the feature dimension) of the + # descriptor. It takes 'layer_to_dim' which is a function from + # layer-names (including sub-layer names, like lstm1.memory_cell) to + # dimensions, e.g. you might have layer_to_dim('ivector') = 100, or + # layer_to_dim('affine1') = 1024. + # note: layer_to_dim will raise an exception if a nonexistent layer or + # sub-layer is requested. + def dim(self, layer_to_dim): + if self.operator is None: + # base-case: self.items = [ layer_name ] (or sub-layer name, like + # 'lstm.memory_cell'). + return layer_to_dim(self.items[0]) + elif self.operator in [ 'Sum', 'Failover', 'IfDefined', 'Switch' ]: + # these are all operators for which all args are descriptors + # and must have the same dim. + dim = self.items[0].dim(layer_to_dim) + for desc in self.items[1:]: + next_dim = desc.dim(layer_to_dim) + if next_dim != dim: + raise XconfigParserError("In descriptor {0}, different fields have different " + "dimensions: {1} != {2}".format(self.str(), dim, next_dim)) + return dim + elif self.operator in [ 'Offset', 'Round', 'ReplaceIndex' ]: + # for these operators, only the 1st arg is relevant. + return self.items[0].dim(layer_to_dim) + elif self.operator == 'Append': + return sum([ x.dim(layer_to_dim) for x in self.items]) + else: + raise XconfigParserError("Unknown operator {0}".format(self.operator)) + + + +# This just checks that seen_item == expected_item, and raises an +# exception if not. +def expect_token(expected_item, seen_item, what_parsing): + if seen_item != expected_item: + raise XconfigParserError("parsing {0}, expected '{1}' but got '{2}'".format( + what_parsing, expected_item, seen_item)) + +# returns true if 'name' is valid as the name of a line (input, layer or output); +# this is the same as IsValidName() in the nnet3 code. +def is_valid_line_name(name): + return isinstance(name, str) and re.match(r'^[a-zA-Z_][-a-zA-Z_0-9.]*', name) != None + +# This function for parsing Descriptors takes an array of tokens as produced +# by tokenize_descriptor. It parses a descriptor +# starting from position pos >= 0 of the array 'tokens', and +# returns a new position in the array that reflects any tokens consumed while +# parsing the descriptor. +# It returns a pair (d, pos) where d is the newly parsed Descriptor, +# and 'pos' is the new position after consuming the relevant input. +# 'prev_names' is so that we can find the most recent layer name for +# expressions like Append(-3, 0, 3) which is shorthand for the most recent +# layer spliced at those time offsets.
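# Editor's note (illustration, not part of the patch; 'tdnn3' is a hypothetical layer name): with prev_names == ['tdnn3'], the shorthand forms described above are expected to expand as follows:
#   >>> Descriptor('Append(-1, 0, 1)', ['tdnn3']).str()
#   'Append(Offset(tdnn3, -1), tdnn3, Offset(tdnn3, 1))'
#   >>> Descriptor('input@-2', ['tdnn3']).str()
#   'Offset(input, -2)'
#   >>> Descriptor('[-1]', ['tdnn3']).str()
#   'tdnn3'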
+def parse_new_descriptor(tokens, pos, prev_names): + size = len(tokens) + first_token = tokens[pos] + pos += 1 + d = Descriptor() + + # when reading this function, be careful to note the indent level, + # there is an if-statement within an if-statement. + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # the 1st argument of all these operators is a Descriptor. + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items = [desc] + + if first_token == 'Offset': + expect_token(',', tokens[pos], 'Offset()') + pos += 1 + try: + t_offset = int(tokens[pos]) + pos += 1 + d.items.append(t_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + if tokens[pos] == ')': + return (d, pos + 1) + elif tokens[pos] != ',': + raise XconfigParserError("Parsing Offset(), expected ')' or ',', got " + tokens[pos]) + pos += 1 + try: + x_offset = int(tokens[pos]) + pos += 1 + d.items.append(x_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Offset()') + pos += 1 + elif first_token in [ 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + while True: + if tokens[pos] == ')': + # check num-items is correct for some special cases. + if first_token == 'Failover' and len(d.items) != 2: + raise XconfigParserError("Parsing Failover(), expected 2 items but got {0}".format(len(d.items))) + if first_token == 'IfDefined' and len(d.items) != 1: + raise XconfigParserError("Parsing IfDefined(), expected 1 item but got {0}".format(len(d.items))) + pos += 1 + break + elif tokens[pos] == ',': + pos += 1 # consume the comma. + else: + raise XconfigParserError("Parsing Append(), expected ')' or ',', got " + tokens[pos]) + + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + elif first_token == 'Round': + expect_token(',', tokens[pos], 'Round()') + pos += 1 + try: + t_modulus = int(tokens[pos]) + assert t_modulus > 0 + pos += 1 + d.items.append(t_modulus) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Round()') + pos += 1 + elif first_token == 'ReplaceIndex': + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + if tokens[pos] in [ 'x', 't' ]: + d.items.append(tokens[pos]) + pos += 1 + else: + raise XconfigParserError("Parsing ReplaceIndex(), expected 'x' or 't', got " + + tokens[pos]) + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + try: + new_value = int(tokens[pos]) + pos += 1 + d.items.append(new_value) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'ReplaceIndex()') + pos += 1 + else: + raise XconfigParserError("code error") + elif first_token in [ 'end of string', '(', ')', ',', '@' ]: + raise XconfigParserError("Expected descriptor, got " + first_token) + elif is_valid_line_name(first_token) or first_token == '[': + # This section parses a raw input/layer/output name, e.g. "affine2" + # (which must start with an alphabetic character or underscore), + # optionally followed by an offset like '@-3'. + + d.operator = None + d.items = [first_token] + + # If the layer-name o is followed by '@', then + # we're parsing something like 'affine1@-3' which + # is syntactic sugar for 'Offset(affine1, 3)'. 
+ if tokens[pos] == '@': + pos += 1 + try: + offset_t = int(tokens[pos]) + pos += 1 + except: + raise XconfigParserError("Parse error parsing {0}@{1}".format( + first_token, tokens[pos])) + if offset_t != 0: + inner_d = d + d = Descriptor() + # e.g. foo@3 is equivalent to 'Offset(foo, 3)'. + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + else: + # the last possible case is that 'first_token' is just an integer i, + # which can appear in things like Append(-3, 0, 3). + # See if the token is an integer. + # In this case, it's interpreted as the name of previous layer + # (with that time offset applied). + try: + offset_t = int(first_token) + except: + raise XconfigParserError("Parsing descriptor, expected descriptor but got " + + first_token) + assert isinstance(prev_names, list) + if len(prev_names) < 1: + raise XconfigParserError("Parsing descriptor, could not interpret '{0}' because " + "there is no previous layer".format(first_token)) + d.operator = None + # the layer name is the name of the most recent layer. + d.items = [prev_names[-1]] + if offset_t != 0: + inner_d = d + d = Descriptor() + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + return (d, pos) + + +# This function takes a string 'descriptor_string' which might +# look like 'Append([-1], [-2], input)', and a list of previous layer +# names like prev_names = ['foo', 'bar', 'baz'], and replaces +# the integers in brackets with the previous layers. -1 means +# the most recent previous layer ('baz' in this case), -2 +# means the last layer but one ('bar' in this case), and so on. +# It will throw an exception if the number is out of range. +# If there are no such expressions in the string, it's OK if +# prev_names == None (this is useful for testing). +def replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names = None): + fields = re.split(r'(\[|\])\s*', descriptor_string) + out_fields = [] + i = 0 + while i < len(fields): + f = fields[i] + i += 1 + if f == ']': + raise XconfigParserError("Unmatched ']' in descriptor") + elif f == '[': + if i + 2 >= len(fields): + raise XconfigParserError("Error tokenizing string '{0}': '[' found too close " + "to the end of the descriptor.".format(descriptor_string)) + assert isinstance(prev_names, list) + try: + offset = int(fields[i]) + assert offset < 0 and -offset <= len(prev_names) + i += 2 # consume the int and the ']'. + except: + raise XconfigParserError("Error tokenizing string '{0}': expression [{1}] has an " + "invalid or out of range offset.".format(descriptor_string, fields[i])) + this_field = prev_names[offset] + out_fields.append(this_field) + else: + out_fields.append(f) + return ''.join(out_fields) + +# tokenizes 'descriptor_string' into the tokens that may be part of Descriptors. +# Note: for convenience in parsing, we add the token 'end-of-string' to this +# list. +# The argument 'prev_names' (for the names of previous layers and input and +# output nodes) is needed to process expressions like [-1] meaning the most +# recent layer, or [-2] meaning the last layer but one. +# The default None for prev_names is only supplied for testing purposes. +def tokenize_descriptor(descriptor_string, + prev_names = None): + # split on '(', ')', ',', '@', and space. Note: the parenthesis () in the + # regexp causes it to output the stuff inside the () as if it were a field, + # which is how the call to re.split() keeps characters like '(' and ')' as + # tokens. 
+ fields = re.split(r'(\(|\)|@|,|\s)\s*', + replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names)) + ans = [] + for f in fields: + # don't include fields that are space, or are empty. + if re.match(r'^\s*$', f) is None: + ans.append(f) + + ans.append('end of string') + return ans + + +# This function parses a line in a config file, something like +# affine-layer name=affine1 input=Append(-3, 0, 3) +# and returns a pair, +# (first_token, fields), as (string, dict) e.g. in this case +# ('affine-layer', {'name':'affine1', 'input':'Append(-3, 0, 3)" +# Note: spaces are allowed in the field names but = signs are +# disallowed, which is why it's possible to parse them. +# This function also removes comments (anything after '#'). +# As a special case, this function will return None if the line +# is empty after removing spaces. +def parse_config_line(orig_config_line): + # Remove comments. + # note: splitting on '#' will always give at least one field... python + # treats splitting on space as a special case that may give zero fields. + config_line = orig_config_line.split('#')[0] + if re.match('[^a-zA-Z0-9\.\-\(\)_\s"]', config_line) is not None: + raise XconfigParserError("Xconfig line has unknown characters.", config_line) + + # Now split on space; later we may splice things back together. + fields=config_line.split() + if len(fields) == 0: + return None # Line was only whitespace after removing comments. + first_token = fields[0] + # if first_token does not look like 'foo-bar' or 'foo-bar2', then die. + if re.match('^[a-z][-a-z0-9]+$', first_token) is None: + raise XconfigParserError("Error parsing config line (first field doesn't look right): {0}".format( + orig_config_line)) + # get rid of the first field which we put in 'first_token'. 
+ fields = fields[1:] + + rest_of_line = ' '.join(fields) + # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' + positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + if not len(positions) % 2 == 0: + raise XconfigParserError('"s should occur in pairs', config_line) + + # add the " enclosed strings and corresponding keys to the dict + # and remove them from the rest_of_line + num_strings = len(positions) / 2 + fields = [] + for i in range(num_strings): + start = positions[i * 2] + end = positions[i * 2 + 1] + rest_of_line_after = rest_of_line[end + 1:] + parts = rest_of_line[:start].split() + rest_of_line_before = ' '.join(parts[:-1]) + assert(parts[-1][-1] == '=') + fields.append(parts[-1][:-1]) + fields.append(rest_of_line[start + 1 : end]) + rest_of_line = rest_of_line_before + ' ' + rest_of_line_after + + # suppose rest_of_line is: 'input=Append(foo, bar) foo=bar' + # then after the below we'll get + # fields = ['', 'input', 'Append(foo, bar)', 'foo', 'bar'] + ans_dict = dict() + other_fields = re.split(r'\s*([-a-zA-Z0-9_]*)=', rest_of_line) + if not (other_fields[0] == '' and len(other_fields) % 2 == 1): + raise XconfigParserError("Could not parse config line: " + orig_config_line) + fields += other_fields[1:] + num_variables = len(fields) / 2 + for i in range(num_variables): + var_name = fields[i * 2] + var_value = fields[i * 2 + 1] + if re.match(r'[a-zA-Z_]', var_name) is None: + raise XconfigParserError("Expected variable name '{0}' to start with alphabetic character or _, " + "in config line {1}".format(var_name, orig_config_line)) + if var_name in ans_dict: + raise XconfigParserError("Config line has multiply defined variable {0}: {1}".format( + var_name, orig_config_line)) + ans_dict[var_name] = var_value + return (first_token, ans_dict) + +# Reads a config file and returns a list of objects, where each object +# represents one line of the file. 
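# Editor's note (illustration, not part of the patch): each line read by the function below is handed to parse_config_line() above, which for example turns
#   'affine-layer name=affine1 input=Append(-3, 0, 3)'
# into the pair ('affine-layer', {'name': 'affine1', 'input': 'Append(-3, 0, 3)'}).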
+def read_config_file(filename): + try: + f = open(filename, "r") + except IOError as e: + raise XconfigParserError("Error reading config file {0}: {1}".format( + filename, repr(e))) + ans = [] + prev_names = [] + while True: + line = f.readline() + if line == '': + break + x = parse_config_line(line) + if x is None: + continue # blank line + (first_token, key_to_value) = x + layer_object = config_line_to_object(first_token, key_to_value, prev_names) + ans.append(layer_object) + prev_names.append(layer_object.get_name()) + return ans + +def test_library(): + tokenize_test = lambda x: tokenize_descriptor(x)[:-1] # remove 'end of string' + assert tokenize_test("hi") == ['hi'] + assert tokenize_test("hi there") == ['hi', 'there'] + assert tokenize_test("hi,there") == ['hi', ',', 'there'] + assert tokenize_test("hi@-1,there") == ['hi', '@', '-1', ',', 'there'] + assert tokenize_test("hi(there)") == ['hi', '(', 'there', ')'] + assert tokenize_descriptor("[-1]@2", ['foo', 'bar'])[:-1] == ['bar', '@', '2' ] + assert tokenize_descriptor("[-2].special@2", ['foo', 'bar'])[:-1] == ['foo.special', '@', '2' ] + + assert Descriptor('foo').str() == 'foo' + assert Descriptor('Sum(foo,bar)').str() == 'Sum(foo, bar)' + assert Descriptor('Sum(Offset(foo,1),Offset(foo,0))').str() == 'Sum(Offset(foo, 1), Offset(foo, 0))' + for x in [ 'Append(foo, Sum(bar, Offset(baz, 1)))', 'Failover(foo, Offset(bar, -1))', + 'IfDefined(Round(baz, 3))', 'Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))', + 'IfDefined(ReplaceIndex(ivector, t, 0))', 'ReplaceIndex(foo, x, 0)' ]: + if not Descriptor(x).str() == x: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), x)) + + prev_names = ['last_but_one_layer', 'prev_layer'] + for x, y in [ ('Sum(foo,bar)', 'Sum(foo, bar)'), + ('Sum(foo1,bar-3_4)', 'Sum(foo1, bar-3_4)'), + ('Append(input@-3, input@0, input@3)', + 'Append(Offset(input, -3), input, Offset(input, 3))'), + ('Append(-3,0,3)', + 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), + ('[-1]', 'prev_layer'), + ('[-2]', 'last_but_one_layer'), + ('[-2]@3', + 'Offset(last_but_one_layer, 3)') ]: + if not Descriptor(x, prev_names).str() == y: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), y)) + + + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar opt2="a=1 b=2"')) + print(parse_config_line('affine-layer1 input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer')) + +if __name__ == "__main__": + test_library() diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py index fdd7a02fd88..b27cd9eff1c 100755 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py @@ -2,6 +2,9 @@ # Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# This script was modified around 11.11.2016, when the code was extended to +# support having a different pdf-class on the self loop. + # Generate a topology file. This allows control of the number of states in the # non-silence HMMs, and in the silence HMMs. This is a modified version of # 'utils/gen_topo.pl' that generates a different type of topology, one that we # believe should be useful in the 'chain' model. Note: right now it doesn't # have any real options, and it treats silence and nonsilence the same. The # intention is that you write different versions of this script, or add options, # if you experiment with it. @@ -41,9 +44,8 @@ # We make the transition-probs 0.5 so they normalize, to keep the code happy. # In fact, we always set the transition probability scale to 0.0 in the 'chain' # code, so they are never used.
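# Editor's note (not part of the patch): in the new entry printed below, the single emitting state uses pdf-class 0 on its forward transition and pdf-class 1 on its self-loop. This is intended to be equivalent to the old three-state entry (kept in gen_topo_orig.py further below for baseline and testing): one frame per phone drawn from one pdf, any additional frames from another, with a minimum duration of one frame.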
-print("<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>") -print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>") -print("<State> 2 </State>") +print("<State> 0 <ForwardPdfClass> 0 <SelfLoopPdfClass> 1 <Transition> 0 0.5 <Transition> 1 0.5 </State>") +print("<State> 1 </State>") print("</TopologyEntry>") print("</Topology>") diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo_orig.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo_orig.py new file mode 100755 index 00000000000..01a715a9a23 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo_orig.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# This file is as ./gen_topo.py used to be (before we extended the transition-model +# code to support having a different self-loop pdf-class). It is included +# here for baseline and testing purposes. + + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + "<colon-separated-nonsilence-phones> <colon-separated-silence-phones>" + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("<Topology>") +print("<TopologyEntry>") +print("<ForPhones>") +print(" ".join([str(x) for x in all_phones])) +print("</ForPhones>") +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. +# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. +print("<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>") +print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>") +print("<State> 2 </State>") +print("</TopologyEntry>") +print("</Topology>") diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py index f012d06cca9..d58db33bf98 100644 --- a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -169,7 +169,8 @@ def PrepareInitialAcousticModel(dir, run_opts): command = run_opts.command, dir = dir)) def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, left_context, right_context, + leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts): # Now do combination.
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where @@ -188,10 +189,13 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ + --verbose=3 {dir}/den.fst {raw_models} \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/combine.cegs ark:- | \ + nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, dir = dir, raw_models = " ".join(raw_model_strings), num_chunk_per_minibatch = num_chunk_per_minibatch, @@ -201,9 +205,20 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'final', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts, + wait = False) -def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, +def ComputeTrainCvProbabilities(dir, iter, + egs_dir, left_context, right_context, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) @@ -213,9 +228,10 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/valid_diagnostic.cegs ark:-| nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) @@ -225,11 +241,12 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/train_diagnostic.cegs ark:- | nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) diff 
--git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cd9ebf4c7a3..15679fb4061 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -118,11 +118,7 @@ def GetArgs(): " chain model's output") parser.add_argument("--chain.left-deriv-truncate", type=int, dest='left_deriv_truncate', - default = None, help="") - parser.add_argument("--chain.right-deriv-truncate", type=int, - dest='right_deriv_truncate', - default = None, help="") - + default = None, help="Deprecated. Kept for back compatibility") # trainer options parser.add_argument("--trainer.srand", type=int, dest='srand', @@ -224,6 +220,14 @@ def GetArgs(): parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=512, help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', + default = None, + help="(Relevant only for recurrent models). If specified, gives the margin " + "(in input frames) around the 'required' part of each chunk that the " + "derivatives are backpropagated to. If unset, the derivatives are " + "backpropagated all the way to the boundaries of the input data. E.g. 8 is " + "a reasonable setting. Note: the 'required' part of the chunk is defined by " + "the model's {left,right}-context.") # General options parser.add_argument("--stage", type=int, default=-4, @@ -258,7 +262,8 @@ def GetArgs(): parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--tree-dir", type=str, required = True, - help="Languade directory") + help="Directory containing the tree to use for this model (we also " + "expect final.mdl and ali.*.gz in that directory") parser.add_argument("--lat-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -284,6 +289,12 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if not args.left_deriv_truncate is None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning("--chain.left-deriv-truncate (deprecated) is set by user, " + "and --trainer.deriv-truncate-margin is set to negative of that value={0}. 
" + "We recommend using the option --trainer.deriv-truncate-margin.".format(args.deriv_truncate_margin)) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") @@ -325,9 +336,9 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, + raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, + min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, @@ -340,10 +351,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi # but we use the same script for consistency with FF-DNN code deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + if not min_deriv_time is None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time) + if not max_deriv_time is None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time) processes = [] for job in range(1,num_jobs+1): @@ -366,7 +377,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, train_queue_opt = run_opts.train_queue_opt, @@ -379,11 +390,12 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, momentum = momentum, max_param_change = max_param_change, raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, + egs_dir = egs_dir, lc=left_context, rc=right_context, + archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, cache_io_opts = cur_cache_io_opts, num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) + wait = False) processes.append(process_handle) @@ -404,7 +416,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + left_context, right_context, + apply_deriv_weights, min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, 
truncate_deriv_weights, @@ -427,8 +440,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + chain_lib.ComputeTrainCvProbabilities(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts) if iter > 0: chain_lib.ComputeProgress(dir, iter, run_opts) @@ -460,15 +480,30 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 cur_max_param_change = float(max_param_change) / math.sqrt(2) - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) + TrainNewModels(dir = dir, + iter = iter, + srand = srand, + num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + momentum = momentum, + max_param_change = cur_max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + frame_subsampling_factor = frame_subsampling_factor, + truncate_deriv_weights = truncate_deriv_weights, + cache_io_opts = cache_io_opts, + run_opts = run_opts) [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] @@ -567,14 +602,15 @@ def Train(args, run_opts): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + egs_left_context = left_context + args.frame_subsampling_factor/2 + egs_right_context = right_context + args.frame_subsampling_factor/2 default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") # this is where get_egs.sh is called. 
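# Editor's note (illustration with hypothetical values, not part of the patch): with model_left_context = 28, model_right_context = 14, zero chunk context and frame_subsampling_factor = 3, the egs would be dumped with egs_left_context = 28 + 3/2 = 29 and egs_right_context = 14 + 3/2 = 15 (integer division); and with --trainer.deriv-truncate-margin 8 and chunk_width = 150, the derivative limits computed further below would be min_deriv_time = -8 - 28 = -36 and max_deriv_time = 150 - 1 + 8 + 14 = 171.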
chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, - left_context + args.frame_subsampling_factor/2, - right_context + args.frame_subsampling_factor/2, + egs_left_context, egs_right_context, run_opts, left_tolerance = args.left_tolerance, right_tolerance = args.right_tolerance, @@ -594,7 +630,7 @@ def Train(args, run_opts): else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, egs_left_context, egs_right_context) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -638,6 +674,12 @@ def Train(args, run_opts): args.initial_effective_lrate, args.final_effective_lrate) + min_deriv_time = None + max_deriv_time = None + if not args.deriv_truncate_margin is None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin + model_right_context + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): @@ -653,18 +695,32 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value, - args.num_chunk_per_minibatch, - num_hidden_layers, args.add_layers_period, - args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, - args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, - args.frame_subsampling_factor, - args.truncate_deriv_weights, run_opts) + TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = args.apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + frame_subsampling_factor = args.frame_subsampling_factor, + truncate_deriv_weights = args.truncate_deriv_weights, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -683,10 +739,17 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - 
chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts) + chain_lib.CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + run_opts = run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index cf755a8d2ec..4bfcb219fc3 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -96,7 +96,7 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) @@ -111,7 +111,7 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) @@ -290,12 +290,12 @@ def AddLstmLayer(config_lines, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, clipping_threshold = 1.0, - norm_based_clipping = "false", + zeroing_threshold = 3.0, + zeroing_interval = 20, ng_per_element_scale_options = "", ng_affine_options = "", lstm_delay = -1, self_repair_scale_nonlinearity = None, - self_repair_scale_clipgradient = None, max_change_per_component = 0.75): assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) components = config_lines['components'] @@ -320,8 +320,6 @@ def AddLstmLayer(config_lines, # self_repair_scale_nonlinearity is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent, # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent self_repair_nonlinearity_string = "self-repair-scale={0:.10f}".format(self_repair_scale_nonlinearity) if self_repair_scale_nonlinearity is not None else '' - # self_repair_scale_clipgradient is a constant scaling the self-repair vector computed in ClipGradientComponent - self_repair_clipgradient_string = "self-repair-scale={0:.2f}".format(self_repair_scale_clipgradient) if self_repair_scale_clipgradient is not None else '' # 
Natural gradient per element scale parameters ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " # Per-component max-change option @@ -357,7 +355,10 @@ def AddLstmLayer(config_lines, components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, cell_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_c type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, cell_dim, clipping_threshold, zeroing_threshold, + zeroing_interval, abs(lstm_delay))) # c1_t and c2_t defined below component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name)) @@ -396,7 +397,10 @@ def AddLstmLayer(config_lines, if (add_recurrent_projection and add_non_recurrent_projection): components.append("# projection matrices : Wrm and Wpm") components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options, max_change_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_r type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, recurrent_projection_dim, clipping_threshold, + zeroing_threshold, zeroing_interval, abs(lstm_delay))) component_nodes.append("# r_t and p_t") component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) @@ -406,8 +410,12 @@ def AddLstmLayer(config_lines, elif add_recurrent_projection: components.append("# projection matrices : Wrm") - components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options, max_change_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format( + name, cell_dim, recurrent_projection_dim, ng_affine_options, max_change_options)) + components.append("component name={0}_r type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, recurrent_projection_dim, clipping_threshold, + 
zeroing_threshold, zeroing_interval, abs(lstm_delay))) component_nodes.append("# r_t") component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) @@ -415,7 +423,10 @@ def AddLstmLayer(config_lines, output_dim = recurrent_projection_dim else: - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, cell_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_r type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, cell_dim, clipping_threshold, + zeroing_threshold, zeroing_interval, abs(lstm_delay))) component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name)) output_descriptor = '{0}_r_t'.format(name) output_dim = cell_dim @@ -430,29 +441,41 @@ def AddBLstmLayer(config_lines, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, clipping_threshold = 1.0, - norm_based_clipping = "false", + zeroing_threshold = 3.0, + zeroing_interval = 20, ng_per_element_scale_options = "", ng_affine_options = "", lstm_delay = [-1,1], self_repair_scale_nonlinearity = None, - self_repair_scale_clipgradient = None, max_change_per_component = 0.75): assert(len(lstm_delay) == 2 and lstm_delay[0] < 0 and lstm_delay[1] > 0) - output_forward = AddLstmLayer(config_lines, "{0}_forward".format(name), input, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + output_forward = AddLstmLayer(config_lines = config_lines, + name = "{0}_forward".format(name), + input = input, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient, max_change_per_component = max_change_per_component) - output_backward = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + output_backward = AddLstmLayer(config_lines = config_lines, + name = "{0}_backward".format(name), + input = input, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[1], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient, max_change_per_component = max_change_per_component) output_descriptor = 'Append({0}, {1})'.format(output_forward['descriptor'], output_backward['descriptor']) 
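# Editor's note (illustration, not part of the patch; 'Lstm1' and dim=1024 are hypothetical): with the defaults above (clipping_threshold=1.0, zeroing_threshold=3.0, zeroing_interval=20, lstm_delay=-1), the replacement for the old ClipGradientComponent line would be rendered as
#   component name=Lstm1_c type=BackpropTruncationComponent dim=1024 clipping-threshold=1.0 zeroing-threshold=3.0 zeroing-interval=20 recurrence-interval=1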
output_dim = output_forward['dimension'] + output_backward['dimension'] @@ -461,4 +484,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 2290c4d2e7f..2a6499090e2 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -90,11 +90,12 @@ def GetDotNodeName(name_string, is_component = False): # this function is required as dot does not allow all the component names # allowed by nnet3. # Identified incompatibilities : - # 1. dot does not allow hyphen(-) in names + # 1. dot does not allow hyphen(-) and dot(.) in names # 2. Nnet3 names can be shared among components and component nodes # dot does not allow common names # node_name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("\.", "_dot_", node_name_string) if is_component: node_name_string += node_name_string.strip() + "_component" return {"label":name_string, "node":node_name_string} diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 8e6e3d8e0e2..01f84484a92 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -77,13 +77,21 @@ def GetArgs(): # Gradient clipper options parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, - help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + help="Outdated option retained for back compatibility, has no effect.", + default=True, choices = ["false", "true"]) parser.add_argument("--clipping-threshold", type=float, - help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + help="clipping threshold used in BackpropTruncation components, " + "if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--zeroing-threshold", type=float, + help="zeroing threshold used in BackpropTruncation components, " + "if zeroing-threshold=0 no periodic zeroing is done", default=3.0) + parser.add_argument("--zeroing-interval", type=int, + help="zeroing interval used in BackpropTruncation components", default=20) parser.add_argument("--self-repair-scale-nonlinearity", type=float, help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=0.00001) parser.add_argument("--self-repair-scale-clipgradient", type=float, - help="A non-zero value activates the self-repair mechanism in the ClipGradient component of the LSTM", default=1.0) + help="Outdated option retained for back compatibility, has no effect.", + default=1.0) # Delay options parser.add_argument("--label-delay", type=int, default=None, @@ -133,8 +141,10 @@ def CheckArgs(args): if (args.num_lstm_layers < 1): sys.exit("--num-lstm-layers has to be a positive integer") - if (args.clipping_threshold < 0): - sys.exit("--clipping-threshold has to be a non-negative") + if (args.clipping_threshold < 0 or args.zeroing_threshold < 0): + sys.exit("--clipping-threshold and --zeroing-threshold have to be non-negative") + if not args.zeroing_interval > 0: + raise Exception("--zeroing-interval has to be positive") if args.lstm_delay is None: args.lstm_delay = [[-1]] * args.num_lstm_layers else: @@ -221,7 +231,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, splice_indexes, lstm_delay, cell_dim, 
hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, - norm_based_clipping, clipping_threshold, + norm_based_clipping, clipping_threshold, zeroing_threshold, zeroing_interval, ng_per_element_scale_options, ng_affine_options, label_delay, include_log_softmax, xent_regularize, self_repair_scale_nonlinearity, self_repair_scale_clipgradient, @@ -243,22 +253,34 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer - prev_layer_output = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + prev_layer_output = nodes.AddBLstmLayer(config_lines = config_lines, + name = "BLstm{0}".format(i+1), + input = prev_layer_output, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[i], - self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient, + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, max_change_per_component = max_change_per_component) else: # add a uni-directional LSTM layer - prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + prev_layer_output = nodes.AddLstmLayer(config_lines = config_lines, + name = "Lstm{0}".format(i+1), + input = prev_layer_output, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[i][0], - self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient, + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, max_change_per_component = max_change_per_component) # make the intermediate config file for layerwise discriminative # training @@ -336,6 +358,8 @@ def Main(): num_hidden_layers = num_hidden_layers, norm_based_clipping = args.norm_based_clipping, clipping_threshold = args.clipping_threshold, + zeroing_threshold = args.zeroing_threshold, + zeroing_interval = args.zeroing_interval, ng_per_element_scale_options = args.ng_per_element_scale_options, ng_affine_options = args.ng_affine_options, label_delay = args.label_delay, diff --git a/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py new file mode 100644 index 00000000000..e6dc907fe0a --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py @@ -0,0 +1 @@ +# This module will house the latest training libraries being written by Vimal diff --git 
a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index c36de8c16bf..06ccf9657be 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -17,6 +17,7 @@ if [ $# != 3 ]; then echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" echo "" echo "Main options (for others, see top of script file)" + echo " --info-bin # Name of the binary to generate the nnet3 file" echo " --component-attributes # attributes to be printed in nnet3 components" echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" @@ -34,6 +35,7 @@ $info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ $attr $dot_file +echo "Generated the dot file $dot_file" command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..e92ab05a847 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -252,7 +252,10 @@ def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') if (egs_left_context < left_context) or (egs_right_context < right_context): - raise Exception('The egs have insufficient context') + raise Exception('The egs have insufficient context.' + ' Required left context is {rlc} and available left context is {alc}.' + ' Required right context is {rrc} and available right context is {arc}.'.format(rlc = left_context, alc = egs_left_context, + rrc = right_context, arc = egs_right_context)) frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) @@ -506,52 +509,65 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False): +def ComputeTrainCvProbabilities(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/valid_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = 
context_opts, egs_dir = egs_dir), wait = wait) -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False): +def ComputeProgress(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait=False): prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) + + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ -"ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" +"ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:-|" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, mb_size = mb_size, prev_model = prev_model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, left_context, right_context, chunk_width = None): # Now do combination. In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -570,26 +586,39 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + --verbose=3 {raw_models} "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/combine.egs ark:- | \ + nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, num_iters = num_iters, + context_opts = context_opts, egs_dir = egs_dir)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'combined', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + wait = False) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, left_context, right_context, + run_opts): # Note: this just uses CPUs, using a smallish subset of data. 
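The refactored diagnostic and combination helpers above now take left_context and right_context and prepend an nnet3-copy-egs stage to each egs pipeline, so the examples are trimmed to the model's context before nnet3-merge-egs batches them. A minimal Python sketch of how that piped rspecifier is assembled (option names follow the diff; the context values and egs path are illustrative only):

# Sketch: build the context options and the piped rspecifier used for the
# diagnostic computations (the values below are examples, not defaults).
left_context, right_context = 40, 40
egs_dir, mb_size = "exp/nnet3/tdnn/egs", 256

context_opts = "--left-context={0} --right-context={1}".format(
    left_context, right_context)

rspecifier = ("ark,bg:nnet3-copy-egs {context_opts} "
              "ark:{egs_dir}/valid_diagnostic.egs ark:- | "
              "nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |"
              .format(context_opts=context_opts, egs_dir=egs_dir,
                      mb_size=mb_size))
print(rspecifier)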
""" Computes the average posterior of the network""" import glob @@ -601,19 +630,24 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:- ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ -matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, dir = dir, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, egs_dir = egs_dir, egs_part = egs_part, + context_opts = context_opts, prior_gpu_opt = run_opts.prior_gpu_opt)) # make sure there is time for $dir/post.{iter}.*.vec to appear. diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..26ca16c364b 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -102,13 +102,23 @@ def Compile(self): lat_file.close() logger.info("Compiling the latex report.") try: - proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(['pdflatex', '-interaction=batchmode', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() except Exception as e: logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) return False return True +def LatexCompliantName(name_string): + # this function is required as latex does not allow all the component names + # allowed by nnet3. + # Identified incompatibilities : + # 1. latex does not allow dot(.) in file names + # + node_name_string = re.sub("\.", "_dot_", name_string) + + return node_name_string + def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): assert(start_iter >= 1) @@ -240,7 +250,8 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) 
plt.grid(True) fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) @@ -317,7 +328,8 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Clipped-proportion value at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Clipped proportion at {0}".format(component_name)) @@ -417,7 +429,8 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Parameter differences at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index e4a9e617e48..4139d446872 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -359,10 +359,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + ComputeTrainCvProbabilities(dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts) + ComputeProgress(dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts) if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): @@ -578,14 +582,24 @@ def Train(args, run_opts): logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - args.minibatch_size, args.frames_per_eg, - num_hidden_layers, args.add_layers_period, - left_context, right_context, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, run_opts) + 
TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -604,12 +618,24 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + avg_post_vec_file = ComputeAveragePosterior(dir = args.dir, + iter = 'combined', + egs_dir = egs_dir, + num_archives = num_archives, + prior_subset_size = args.prior_subset_size, + left_context = left_context, + right_context = right_context, + run_opts = run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir = args.dir) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 7ac7a58a3d5..89db4276cfc 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -194,7 +194,7 @@ def GetArgs(): help="Number of sequences to be processed in parallel every minibatch" ) parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, - help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to (chunk-width + 10)." 
) # General options parser.add_argument("--stage", type=int, default=-4, @@ -346,7 +346,7 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, + left_context, right_context, min_deriv_time, max_deriv_time, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, cache_read_opt, run_opts): @@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + --optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \ "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, @@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, + min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time, raw_model = raw_model_string, context_opts = context_opts, egs_dir = egs_dir, archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, @@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, + left_context, right_context, min_deriv_time, max_deriv_time, momentum, max_param_change, shuffle_buffer_size, cv_minibatch_size, run_opts): # Set off jobs doing some diagnostics, in the background. @@ -430,10 +430,22 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) + ComputeTrainCvProbabilities(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + mb_size=cv_minibatch_size) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) + ComputeProgress(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + mb_size=cv_minibatch_size) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. 
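With the changes above, backprop truncation in train_rnn.py is bounded on both ends: the trainer now passes --optimization.max-deriv-time in addition to --optimization.min-deriv-time, both derived from num_bptt_steps. A small sketch of the arithmetic, assuming the default num_bptt_steps and illustrative chunk/context values:

# Deriv-time window used by train_rnn.py (chunk/context values are examples).
chunk_width = 20
chunk_left_context = 40
chunk_right_context = 40
num_bptt_steps = None  # None means "use the default"

if num_bptt_steps is None:
    # default: chunk_width plus up to 10 extra frames, limited by the contexts
    num_bptt_steps = chunk_width + min(10, chunk_left_context,
                                       chunk_right_context)

min_deriv_time = chunk_width - num_bptt_steps  # 20 - 30 = -10
max_deriv_time = num_bptt_steps - 1            # 29
print(min_deriv_time, max_deriv_time)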
@@ -467,12 +479,24 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: pass - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) + TrainNewModels(dir = dir, + iter = iter, + srand = srand, + num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + momentum = momentum, + max_param_change = max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + cache_read_opt = cache_read_opt, + run_opts = run_opts) [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: @@ -627,11 +651,13 @@ def Train(args, run_opts): cur_egs_dir=egs_dir if args.num_bptt_steps is None: - num_bptt_steps = args.chunk_width + # num_bptt_steps is set to (chunk_width + 10) by default + num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context) else: num_bptt_steps = args.num_bptt_steps min_deriv_time = args.chunk_width - num_bptt_steps + max_deriv_time = num_bptt_steps - 1 logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) @@ -672,6 +698,7 @@ def Train(args, run_opts): left_context = left_context, right_context = right_context, min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, momentum = args.momentum, max_param_change= args.max_param_change, shuffle_buffer_size = args.shuffle_buffer_size, @@ -696,13 +723,25 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - chunk_width = args.chunk_width) + CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + chunk_width = args.chunk_width) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + avg_post_vec_file = ComputeAveragePosterior(dir = args.dir, + iter = 'combined', + egs_dir = egs_dir, + num_archives = num_archives, + prior_subset_size = args.prior_subset_size, + left_context = left_context, + right_context = right_context, + run_opts = run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir = args.dir) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py new file mode 100755 index 00000000000..e29a9404403 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast +from collections import defaultdict + 
+sys.path.insert(0, 'steps/') +# the following is in case we weren't running this from the normal directory. +sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') + +import libs.nnet3.xconfig.parser as xparser +# do the proper import when python scripts have been refactored +nnet3_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py') + +def get_args(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description='Reads an xconfig file and creates config files ' + 'for neural net creation and training', + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser.add_argument('--xconfig-file', required=True, + help='Filename of input xconfig file') + parser.add_argument('--config-dir', required=True, + help='Directory to write config files and variables') + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = check_args(args) + + return args + +def check_args(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + return args + + + + +def backup_xconfig_file(xconfig_file, config_dir): + # we write a copy of the xconfig file just to have a record of the original + # input. + try: + xconfig_file_out = open(config_dir + '/xconfig', 'w') + except: + sys.exit('{0}: error opening file {1}/xconfig for output'.format( + sys.argv[0], config_dir)) + try: + xconfig_file_in = open(xconfig_file) + except: + sys.exit('{0}: error opening file {1} for input'.format(sys.argv[0], config_dir)) + + print("# This file was created by the command:\n" + "# {0}\n" + "# It is a copy of the source from which the config files in " + "# this directory were generated.\n".format(' '.join(sys.argv)), + file=xconfig_file_out) + + while True: + line = xconfig_file_in.readline() + if line == '': + break + print(line.strip(), file=xconfig_file_out) + xconfig_file_out.close() + xconfig_file_in.close() + + +# This functions writes config_dir/xconfig.expanded.1 and +# config_dir/xconfig.expanded.2, showing some of the internal stages of +# processing the xconfig file before turning it into config files. +def write_expanded_xconfig_files(config_dir, all_layers): + try: + xconfig_file_out = open(config_dir + '/xconfig.expanded.1', 'w') + except: + sys.exit('{0}: error opening file {1}/xconfig.expanded.1 for output'.format( + sys.argv[0], config_dir)) + + + print('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '#It contains the same content as ./xconfig but it was parsed and\n' + '#default config values were set.\n' + '# See also ./xconfig.expanded.2\n', file=xconfig_file_out) + + for layer in all_layers: + print(str(layer), file=xconfig_file_out) + xconfig_file_out.close() + + try: + xconfig_file_out = open(config_dir + '/xconfig.expanded.2', 'w') + except: + sys.exit('{0}: error opening file {1}/xconfig.expanded.2 for output'.format( + sys.argv[0], config_dir)) + + print('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the same content as ./xconfig but it was parsed,\n' + '# default config values were set, and Descriptors (input=xxx) were normalized.\n' + '# See also ./xconfig.expanded.1\n\n', + file=xconfig_file_out) + + for layer in all_layers: + layer.normalize_descriptors() + print(str(layer), file=xconfig_file_out) + xconfig_file_out.close() + +# This function returns a map from config-file basename +# e.g. 'init', 'ref', 'layer1' to a documentation string that goes +# at the top of the file. 
+def get_config_headers(): + ans = defaultdict(str) # resulting dict will default to the empty string + # for any config files not explicitly listed here. + ans['init'] = ('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the input of the network and is used in\n' + '# accumulating stats for an LDA-like transform of the\n' + '# input features.\n'); + ans['ref'] = ('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the entire neural network, but with those\n' + '# components that would normally require fixed vectors/matrices\n' + '# read from disk, replaced with random initialization\n' + '# (this applies to the LDA-like transform and the\n' + '# presoftmax-prior-scale, if applicable). This file\n' + '# is used only to work out the left-context and right-context\n' + '# of the network.\n'); + ans['final'] = ('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the entire neural network.\n') + + return ans; + + + + +# This is where most of the work of this program happens. +def write_config_files(config_dir, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + config_basename_to_header = get_config_headers() + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], str(layer), + repr(e)), file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + for basename,lines in config_basename_to_lines.items(): + header = config_basename_to_header[basename] + filename = '{0}/{1}.config'.format(config_dir, basename) + try: + f = open(filename, 'w') + print(header, file=f) + for line in lines: + print(line, file=f) + f.close() + except Exception as e: + print('{0}: error writing to config file {1}: error is {2}'.format( + sys.argv[0], filename, repr(e)), file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + +def add_back_compatibility_info(config_dir): + """This will be removed when python script refactoring is done.""" + + nnet3_lib.RunKaldiCommand("nnet3-init {0}/ref.config {0}/ref.raw".format(config_dir)) + out, err = nnet3_lib.RunKaldiCommand("nnet3-info {0}/ref.raw | head -4".format(config_dir)) + #out looks like this + # left-context: 7 + # right-context: 0 + # num-parameters: 90543902 + # modulus: 1 + info = {} + for line in out.split("\n"): + parts = line.split(":") + if len(parts) != 2: + continue + info[parts[0].strip()] = int(parts[1].strip()) + + # Writing the back-compatible vars file + # model_left_context=0 + # model_right_context=7 + # num_hidden_layers=3 + vf = open('{0}/vars'.format(config_dir), 'w') + vf.write('model_left_context={0}\n'.format(info['left-context'])) + vf.write('model_right_context={0}\n'.format(info['right-context'])) + vf.write('num_hidden_layers=1\n') + vf.close() + + nnet3_lib.ForceSymlink("final.config".format(config_dir), + "{0}/layer1.config".format(config_dir)) + +def main(): + args = get_args() + backup_xconfig_file(args.xconfig_file, args.config_dir) + all_layers = 
xparser.read_xconfig_file(args.xconfig_file) + write_expanded_xconfig_files(args.config_dir, all_layers) + write_config_files(args.config_dir, all_layers) + add_back_compatibility_info(args.config_dir) + + +if __name__ == '__main__': + main() + + +# test: +# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo +# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo + +# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'relu-renorm-layer name=affine1 dim=1024'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo + +# mkdir -p foo; (echo 'input dim=100 name=ivector'; echo 'input dim=40 name=input'; echo 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo diff --git a/egs/wsj/s5/utils/lang/make_unk_lm.sh b/egs/wsj/s5/utils/lang/make_unk_lm.sh index f92d02ffc43..b46ab128b93 100755 --- a/egs/wsj/s5/utils/lang/make_unk_lm.sh +++ b/egs/wsj/s5/utils/lang/make_unk_lm.sh @@ -258,7 +258,7 @@ if ! $position_dependent_phones; then # We don't need to take into account the disambig symbol because we compose on # the right with this FST, and it doesn't appear on the output side. cat $dir/all_nonsil_phones | \ - awk -v '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; } + awk '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; } print 2,0.0; }' > $dir/constraint_fst.txt fi else diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index ea5264a0f07..054210cdd23 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -51,7 +51,6 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 -num_word_disambig_syms=1 position_dependent_phones=true # position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source diff --git a/src/INSTALL b/src/INSTALL index e0fdcc81e60..3f7a01928ba 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -7,14 +7,13 @@ You must first have completed the installation steps in ../tools/INSTALL (compiling OpenFst; getting ATLAS and CLAPACK headers). The installation instructions are: -./configure +./configure --shared make depend make Note that "make" takes a long time; you can speed it up by running make -in parallel if you have multiple CPUs, for instance +in parallel if you have multiple CPUs, for instance make depend -j 8 make -j 8 For more information, see documentation at http://kaldi-asr.org/doc/ and click on "The build process (how Kaldi is compiled)". 
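add_back_compatibility_info() recovers the model context by scraping the first few "key: value" lines that nnet3-info prints for ref.raw, then writes the old-style vars file. A minimal sketch of just the parsing and formatting step (the sample output is the one quoted in the script's comment; the real code obtains it via RunKaldiCommand):

# Parse "key: value" lines from nnet3-info and format the back-compat vars.
sample_output = """left-context: 7
right-context: 0
num-parameters: 90543902
modulus: 1"""

info = {}
for line in sample_output.split("\n"):
    parts = line.split(":")
    if len(parts) != 2:
        continue
    info[parts[0].strip()] = int(parts[1].strip())

vars_text = ("model_left_context={0}\n"
             "model_right_context={1}\n"
             "num_hidden_layers=1\n").format(info["left-context"],
                                             info["right-context"])
print(vars_text)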
- diff --git a/src/base/kaldi-math-test.cc b/src/base/kaldi-math-test.cc index 52719cc4669..8d6e6164eac 100644 --- a/src/base/kaldi-math-test.cc +++ b/src/base/kaldi-math-test.cc @@ -57,6 +57,17 @@ void UnitTestRoundUpToNearestPowerOfTwo() { KALDI_ASSERT(RoundUpToNearestPowerOfTwo(1073700000) == 1073741824); } +void UnitTestDivideRoundingDown() { + for (int32 i = 0; i < 100; i++) { + int32 a = RandInt(-100, 100); + int32 b = 0; + while (b == 0) + b = RandInt(-100, 100); + KALDI_ASSERT(DivideRoundingDown(a, b) == + std::floor(static_cast(a) / static_cast(b))); + } +} + void UnitTestGcdLcm() { UnitTestGcdLcmTpl(); UnitTestGcdLcmTpl(); @@ -314,6 +325,7 @@ int main() { UnitTestRand(); UnitTestAssertFunc(); UnitTestRoundUpToNearestPowerOfTwo(); + UnitTestDivideRoundingDown(); UnitTestExpSpeed(); UnitTestExpSpeed(); UnitTestLogSpeed(); diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index ac590a06a25..3ee6fe4ccf9 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -280,6 +280,17 @@ static inline void AssertEqual(float a, float b, // RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. int32 RoundUpToNearestPowerOfTwo(int32 n); +/// Returns a / b, rounding towards negative infinity in all cases. +static inline int32 DivideRoundingDown(int32 a, int32 b) { + KALDI_ASSERT(b != 0); + if (a * b >= 0) + return a / b; + else if (a < 0) + return (a - b + 1) / b; + else + return (a - b - 1) / b; +} + template I Gcd(I m, I n) { if (m == 0 || n == 0) { if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. diff --git a/src/bin/acc-tree-stats.cc b/src/bin/acc-tree-stats.cc index 90432c2e58a..8b9ce9065b4 100644 --- a/src/bin/acc-tree-stats.cc +++ b/src/bin/acc-tree-stats.cc @@ -128,5 +128,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/align-text.cc b/src/bin/align-text.cc index 833e29efe3b..616dac858d7 100644 --- a/src/bin/align-text.cc +++ b/src/bin/align-text.cc @@ -58,9 +58,11 @@ int main(int argc, char *argv[]) { po.Register("special-symbol", &special_symbol, "Special symbol to be " "aligned with the inserted or deleted words. Your sentences " "should not contain this symbol."); - po.Register("separator", &separator, "Separator for each aligned pairs in " - "the output alignment file. Your sentences should not contain " - "this symbol."); + po.Register("separator", &separator, "Separator for each aligned pair in " + "the output alignment file. Note: it should not be necessary " + "to change this even if your sentences contain ';', because " + "to parse the output of this program you can just split on " + "space and then assert that every third token is ';'."); po.Read(argc, argv); @@ -91,16 +93,12 @@ int main(int argc, char *argv[]) { const std::vector &text1 = text1_reader.Value(); const std::vector &text2 = text2_reader.Value(key); - // Checks if the special symbol and separator is in the string. + // Checks if the special symbol is in the string. KALDI_ASSERT(std::find(text1.begin(), text1.end(), special_symbol) == text1.end()); KALDI_ASSERT(std::find(text2.begin(), text2.end(), special_symbol) == text2.end()); - KALDI_ASSERT(std::find(text1.begin(), - text1.end(), separator) == text1.end()); - KALDI_ASSERT(std::find(text2.begin(), - text2.end(), separator) == text2.end()); - + if (std::find_if(text1.begin(), text1.end(), IsNotToken) != text1.end()) { KALDI_ERR << "In text1, the utterance " << key << " contains unprintable characters." 
\ << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; @@ -111,7 +109,7 @@ int main(int argc, char *argv[]) { << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; return -1; } - + std::vector > aligned; LevenshteinAlignment(text1, text2, special_symbol, &aligned); diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index b092b3de4d7..6f494a0c562 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -220,11 +220,6 @@ static void SortOnTransitionCount(fst::StdVectorFst *fst) { void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { - fst::PushSpecial(fst, fst::kDelta * 0.01); - MinimizeAcceptorNoPush(fst); - KALDI_LOG << "Number of states and arcs in transition-id FST after regular " - << "minimization is " << fst->NumStates() << " and " - << NumArcs(*fst) << " (pass " << i << ")"; fst::StdVectorFst fst_reversed; fst::Reverse(*fst, &fst_reversed); fst::PushSpecial(&fst_reversed, fst::kDelta * 0.01); @@ -233,6 +228,11 @@ void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { KALDI_LOG << "Number of states and arcs in transition-id FST after reversed " << "minimization is " << fst->NumStates() << " and " << NumArcs(*fst) << " (pass " << i << ")"; + fst::PushSpecial(fst, fst::kDelta * 0.01); + MinimizeAcceptorNoPush(fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after regular " + << "minimization is " << fst->NumStates() << " and " + << NumArcs(*fst) << " (pass " << i << ")"; } fst::RmEpsilon(fst); KALDI_LOG << "Number of states and arcs in transition-id FST after " @@ -347,7 +347,7 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, BaseFloat self_loop_scale = 1.0; // We have to be careful to use the same // value in test time. - bool reorder = false; + bool reorder = true; // add self-loops to the FST with transition-ids as its labels. AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, &transition_id_fst); diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 640040c60f3..f093f21a5a5 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,21 +20,40 @@ #include #include "chain/chain-kernels-ansi.h" -template -__device__ inline void atomic_add(Real* address, Real value) { - atomicAdd(address, value); -} +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 +#error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ + configure with --use-cuda=no (this will disable the use of GPU). 
+#endif -template<> -__device__ inline void atomic_add(double* address, double val) { - unsigned long long int* address_as_ull = - reinterpret_cast(address); + +#ifdef __CUDACC__ +#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 ) +// native implementation available +#else +#if __CUDA_ARCH__ >= 600 +#error using CAS implementation of double atomicAdd +#endif +__device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*) address; unsigned long long int old = *address_as_ull, assumed; + do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + + return __longlong_as_double(old); +} +#endif +#endif + + +template +__device__ inline void atomic_add(Real* address, Real value) { + atomicAdd(address, value); } template @@ -268,4 +287,3 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, this_beta, log_prob_deriv, log_prob_deriv_stride); } - diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 00ed56308b3..b0c963595a1 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -316,15 +316,15 @@ int main(int argc, char *argv[]) { num_written++; } } else if (count > 0) { - const NnetChainExample &eg = example_reader.Value(); + NnetChainExample eg = example_reader.Value(); + if (frame_shift != 0) + ShiftChainExampleTimes(frame_shift, exclude_names, &eg); NnetChainExample eg_out; if (left_context != -1 || right_context != -1) ModifyChainExampleContext(eg, left_context, right_context, frame_subsampling_factor, &eg_out); else - eg_out = eg; - if (frame_shift != 0) - ShiftChainExampleTimes(frame_shift, exclude_names, &eg_out); + eg_out.Swap(&eg); if (truncate_deriv_weights != 0) TruncateDerivWeights(truncate_deriv_weights, &eg_out); for (int32 c = 0; c < count; c++) { @@ -344,5 +344,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/configure b/src/configure index 1b94d744228..3446a9532e0 100755 --- a/src/configure +++ b/src/configure @@ -9,8 +9,8 @@ # Example command lines: -# ./configure # ./configure --shared ## shared libraries. 
+# ./configure # ./configure --mkl-root=/opt/intel/mkl # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb @@ -447,8 +447,8 @@ function configure_cuda { fi case $CUDA_VERSION in - 5_5) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; - 6_*) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; + 5_5) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; 7_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; 8_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 4642048989e..d475143d444 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -220,6 +220,9 @@ void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); +void cudaF_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride, + const float *x, MatrixDim x_d, float tartget_rms, + bool add_log_stddev); void cudaF_group_spec_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); @@ -489,6 +492,9 @@ void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); +void cudaD_normalize_per_row(size_t Gr, size_t Bl, double *y, int y_stride, + const double *x, MatrixDim x_d, double tartget_rms, + bool add_log_stddev); void cudaD_group_spec_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index bddd1227441..00f4f14cb66 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,6 +28,8 @@ #include #include "cudamatrix/cu-kernels-ansi.h" + + /*********************************************************************** * Generic __device__ functions */ @@ -379,7 +381,7 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) { int32_cuda dst_index = i + j * dst_d.stride, src_index = i + j * src_stride; if (i < dst_d.cols && j < dst_d.rows) { Real a = mat[dst_index], b = A[src_index]; - mat[dst_index] = (a > b ? 
a : b); + mat[dst_index] = fmax(a, b); } } @@ -890,9 +892,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, // Tree reduce to 2x warpSize elements. # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { + if (tid < shift) ssum[tid] += ssum[tid + shift]; - } __syncthreads(); } @@ -1248,7 +1249,7 @@ struct TransReduceOp { } __forceinline__ __device__ Real Reduce(const Real& a, const Real& b) const { - return max(a, b); + return fmax(a, b); } __forceinline__ __device__ Real PostReduce(const Real& x, const Real& output) const { @@ -1288,7 +1289,7 @@ struct TransReduceOp { } __forceinline__ __device__ Real Reduce(const Real& a, const Real& b) const { - return max(a, b); + return fmax(a, b); } __forceinline__ __device__ Real PostReduce(const Real& x, const Real& output) const { @@ -2155,7 +2156,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { // reduce to CU1DBLOCK elements per row. Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF; for (int j = tid; j < d.cols; j += CU1DBLOCK) { - tmax = max(tmax, x[x_start + j]); + tmax = fmax(tmax, x[x_start + j]); } smem[tid] = tmax; __syncthreads(); @@ -2164,7 +2165,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { if (tid < shift) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } __syncthreads(); } @@ -2173,7 +2174,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } } @@ -2217,6 +2218,77 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { } } +// The output y_i = scale * x_i, +// and we want to RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop, that the floor's +// square root is exactly representable as float. +// If add_log_stddev is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. +// +// 1D grid is used. Each 256-thread block works on 1 row of the data matrix. +// The block is also of 1D. Strided memory access is used if the length of the +// row is longer than 256. +template +__global__ +static void _normalize_per_row(Real *y, int y_stride, const Real *x, + MatrixDim x_d, Real target_rms, + bool add_log_stddev) { + const int i = blockIdx.x; + const int tid = threadIdx.x; + const Real* x_row = x + i * x_d.stride; + __shared__ Real ssum[CU1DBLOCK]; + + // Reduce x_j^2 to CU1DBLOCK elements per row + Real tsum = Real(0); + for (int j = tid; j < x_d.cols; j += CU1DBLOCK) { + tsum += x_row[j] * x_row[j]; + } + ssum[tid] = tsum; + __syncthreads(); + + // Tree reduce to 2x warpSize elements per row +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + ssum[tid] += ssum[tid + shift]; + __syncthreads(); + } + + // Reduce last warp to 1 element per row. + // Threads implicitly synchronized within a warp. 
+ if (tid < warpSize) { +# pragma unroll + for (int shift = warpSize; shift > 0; shift >>= 1) { + ssum[tid] += ssum[tid + shift]; + } + } + + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + if (tid == 0) { + ssum[0] = sqrt( + fmax(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); + } + + // Broadcast floored stddev to all threads. + __syncthreads(); + const Real stddev_div_target_rms = ssum[0]; + const Real scale = Real(1) / stddev_div_target_rms; + + // Store normalized input to output + Real* y_row = y + i * y_stride; + for (int j = tid; j < x_d.cols; j += CU1DBLOCK) { + y_row[j] = x_row[j] * scale; + } + + if (tid == 0 && add_log_stddev) { + y_row[x_d.cols] = log(stddev_div_target_rms * target_rms); + } +} + + // Per-row log-softmax operation on 'x', with writing to 'y'. // note, x and y may point to the same memory. This is equivalent to setting // matrix y to matrix x and then, for each row of y, subtracting the offset that @@ -2240,7 +2312,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, // reduce to CU1DBLOCK elements per row. Real tmax = -1e20; for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { - tmax = max(tmax, x[x_start + j]); + tmax = fmax(tmax, x[x_start + j]); } smem[tid] = tmax; __syncthreads(); @@ -2249,7 +2321,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { if (tid < shift) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } __syncthreads(); } @@ -2257,7 +2329,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, // reduce to 1 element per row if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } } @@ -3182,6 +3254,12 @@ void cudaF_splice(dim3 Gr, dim3 Bl, float* y, const float* x, _splice<<>>(y,x,off,d_out,d_in); } +void cudaF_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride, + const float *x, MatrixDim x_d, float target_rms, + bool add_log_stddev) { + _normalize_per_row<<>>(y, y_stride, x, x_d, target_rms, add_log_stddev); +} + void cudaF_one(int Gr, int Bl, float* x, int dim) { _one<<>>(x,dim); } @@ -3811,6 +3889,12 @@ void cudaD_log_softmax_reduce(size_t Gr, size_t Bl, double* y, const double* x, _log_softmax_reduce<<>>(y, x, y_dim, x_stride); } +void cudaD_normalize_per_row(size_t Gr, size_t Bl, double *y, int y_stride, + const double *x, MatrixDim x_d, double target_rms, + bool add_log_stddev) { + _normalize_per_row<<>>(y, y_stride, x, x_d, target_rms, add_log_stddev); +} + void cudaD_splice(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* off, MatrixDim d_out, MatrixDim d_in) { _splice<<>>(y,x,off,d_out,d_in); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a6e81db5d6c..55259cba147 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -590,6 +590,12 @@ inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, MatrixDim d) { cudaF_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d); } +inline void cuda_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride, + const float *x, MatrixDim x_d, + float target_rms, bool add_log_stddev) { + cudaF_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms, + add_log_stddev); +} inline void cuda_diff_softmax(dim3 Gr, dim3 Bl, float* x, const MatrixDim 
dim, const float* value, const int value_stride, const float* diff, const int diff_stride) { @@ -1110,6 +1116,12 @@ inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y, int x_stride) { cudaD_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride); } +inline void cuda_normalize_per_row(size_t Gr, size_t Bl, double *y, + int y_stride, const double *x, MatrixDim x_d, + double target_rms, bool add_log_stddev) { + cudaD_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms, + add_log_stddev); +} inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d, diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 2e096e76ae8..494c676250b 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -139,6 +139,76 @@ static void UnitTestCuMathSplice() { } } + +template +static void UnitTestCuMathNormalizePerRow() { + + for (int32 i = 0; i < 2; i++) { + int row = 10 + Rand() % 40; + int col = 10 + Rand() % 50; + + Matrix Hi(row,col); + Matrix Ho(row,col+1); + Hi.SetRandn(); + Hi.Scale(5.0); + + CuMatrix Di(row, col); + CuMatrix Do(row, col+1); + Di.CopyFromMat(Hi); + + Real target_rms = 0.3456; + bool add_log_stddev = true; + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + + //gpu + cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do); + + //cpu + { + MatrixBase& in(Hi); + MatrixBase& out(Ho); + Real target_rms=0.3456; + SubMatrix out_no_log(out, 0, out.NumRows(), 0, in.NumCols()); + if (in.Data() != out_no_log.Data()) + out_no_log.CopyFromMat(in); + Vector in_norm(in.NumRows()); + Real d_scaled = in.NumCols() * target_rms * target_rms; + in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); + in_norm.ApplyFloor(kSquaredNormFloor); + in_norm.ApplyPow(-0.5); + out_no_log.MulRowsVec(in_norm); + if (add_log_stddev) { + in_norm.ApplyLog(); + in_norm.Scale(-1.0); + in_norm.Add(log(target_rms)); + out.CopyColFromVec(in_norm, in.NumCols()); + } + } + + Matrix Ho2(Do); + AssertEqual(Ho,Ho2,0.00001); + } + + for (int dim = 16; dim <= 1024; dim *= 2) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim + 1); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + cu::NormalizePerRow(M, Real(1), true, &N); + } + + BaseFloat gflops = ((BaseFloat) dim * dim * iter) + / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::NormalizePerRow" + << (sizeof(Real)==8?"":"") << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + } +} + + template void CudaMathUnitTest() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) @@ -146,6 +216,7 @@ template void CudaMathUnitTest() { UnitTestCuMathRandomize(); UnitTestCuMathSplice(); UnitTestCuMathCopy(); + UnitTestCuMathNormalizePerRow(); } diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 97757ba68dd..f01760d41bb 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -29,15 +29,15 @@ namespace kaldi { namespace cu { /* - * templated functions wrapping the ANSI-C CUDA kernel functions + * templated functions wrapping the ANSI-C CUDA kernel functions */ template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, Real lr) { KALDI_ASSERT(SameDim(*weight, *grad)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); @@ -46,7 +46,7 @@ void 
RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr, weight->Dim(), grad->Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, MatrixBase &grad2 = grad->Mat(); for(MatrixIndexT r=0; r &src, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + /* - Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 + Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK)); */ /* * Let's use blocksize 4 x 128 (512 threads/block) - * and extend the randomizable matrices to: col 4*65535, row 128*65535 + * and extend the randomizable matrices to: col 4*65535, row 128*65535 * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints)) */ dim3 dimBlock(4, 128); @@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase &src, cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_idx.Data(), dimtgt, dimsrc); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase &src, tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i])); } } -} +} template void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, CuMatrixBase *tgt) { - + KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(), frame_offsets.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -171,7 +171,7 @@ void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, template void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, - CuMatrixBase *tgt) { + CuMatrixBase *tgt) { KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); @@ -179,14 +179,14 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_indices.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -233,9 +233,65 @@ void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, CuMatrixBase *tgt); +// The output y_i = scale * x_i, +// and we want to RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop, that the floor's +// square root is exactly representable as float. 
+// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. +template +void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, + const bool add_log_stddev, CuMatrixBase* out) { + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + if (add_log_stddev) { + KALDI_ASSERT(in.NumRows() == out->NumRows()); + KALDI_ASSERT(in.NumCols() + 1 == out->NumCols()); + } else { + KALDI_ASSERT(SameDim(in, *out)); + } + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + size_t dimBlock = CU1DBLOCK; + size_t dimGrid = out->NumRows(); + cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(), + in.Data(), in.Dim(), target_rms, add_log_stddev); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + CuSubMatrix out_no_log(*out, 0, out->NumRows(), 0, in.NumCols()); + if (in.Data() != out_no_log.Data()) + out_no_log.CopyFromMat(in); + CuVector in_norm(in.NumRows()); + Real d_scaled = in.NumCols() * target_rms * target_rms; + in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); + in_norm.ApplyFloor(kSquaredNormFloor); + in_norm.ApplyPow(-0.5); + out_no_log.MulRowsVec(in_norm); + if (add_log_stddev) { + in_norm.ApplyLog(); + in_norm.Scale(-1.0); + in_norm.Add(log(target_rms)); + out->CopyColFromVec(in_norm, in.NumCols()); + } + } +} + +template +void NormalizePerRow(const CuMatrixBase& in, const float target_rms, + const bool add_log_stddev, CuMatrixBase* out); +template +void NormalizePerRow(const CuMatrixBase& in, const double target_rms, + const bool add_log_stddev, CuMatrixBase* out); + } //namespace cu } //namespace kaldi - diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 65a4c0c4af3..0afbb9476a1 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -78,6 +78,23 @@ void Group2norm(const CuMatrixBase &src, CuMatrixBase *dest, int32 group_stride); +/// Normalize nonlinearity modifies the vector of activations +/// by scaling it so that the root-mean-square equals 1.0. +/// +/// The output y_i = scale * x_i, +/// and we want to RMS value of the y_i to equal target_rms, +/// so y^t y = D * target_rms^2 (if y is one row of the input). +/// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +/// there is also flooring involved, to avoid division-by-zero +/// problems. It's important for the backprop, that the floor's +/// square root is exactly representable as float. +/// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +/// is an extra dimension of the output. +template +void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, + const bool add_log_stddev, CuMatrixBase* out); + + diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 30873cfa9b0..5788b95d9c0 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -92,7 +92,17 @@ great loss. The pdf-class is a concept that relates to the HmmTopology object. The HmmTopology object specifies a prototype HMM for each phone. Each numbered state of a -"prototype HMM" has a variable "pdf_class". If two states have the same +"prototype HMM" has two variables "forward_pdf_class" and "self_loop_pdf_class". +The "self_loop_pdf_class" is a kind of pdf-class that is associated +with self-loop transition. It is by default identical to "forward_pdf_class", +but it can be used to define less-convectional HMM topologies +where the pdfs on the self-loop and forward transitions are different. 
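The CUDA kernel and the CPU fallback introduced above compute the same per-row normalization: each row x of dimension D is scaled by 1 / sqrt(x^T x / (D * target_rms^2)), with the squared-norm term floored at 2^-66 so the scale stays finite, and with an optional extra output column holding log(sqrt(floored x^T x / D)). A NumPy sketch of the same arithmetic, intended only as a reference against the C++ paths (NumPy is an assumption made for illustration):

import numpy as np

def normalize_per_row(x, target_rms=1.0, add_log_stddev=False):
    # x has shape (num_rows, D); mirrors cu::NormalizePerRow in cu-math.cc.
    k_squared_norm_floor = 1.35525271560688e-20  # 2^-66
    d_scaled = x.shape[1] * target_rms * target_rms
    sq_norm = np.maximum((x * x).sum(axis=1) / d_scaled, k_squared_norm_floor)
    scale = 1.0 / np.sqrt(sq_norm)               # per-row scaling factor
    out = x * scale[:, None]
    if add_log_stddev:
        # extra column: log of the (floored) uncentred stddev of the row
        out = np.hstack([out, np.log(np.sqrt(sq_norm) * target_rms)[:, None]])
    return out

x = np.random.randn(3, 5)
y = normalize_per_row(x, target_rms=0.3456, add_log_stddev=True)
print(np.sqrt((y[:, :5] ** 2).mean(axis=1)))     # each row's RMS ~ 0.3456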
+The decision to allow the pdf-class on just the self-loop to be different, +while not embracing a fully "arc-based" representation where the pdfs on +all transitions in the HMM are potentially independent, was made as a compromise, +to allow for compatibility with previous versions of Kaldi while supporting the topology +used in our "chain models" AKA lattice-free MMI. +If two states have the same pdf_class variable, then they will always share the same probability distribution function (p.d.f.) if they are in the same phonetic context. This is because the decision-tree code does not get to "see" the HMM-state directly, @@ -121,11 +131,14 @@ object to get the pdf-ids associated with particular phonetic contexts). The decision that underlies a lot of the transition-modeling code is as follows: we have decided to make the transition probability of a -context dependent HMM state depend on the following four things (you could view -them as a 4-tuple): +context dependent HMM state depend on the following five things (you could view +them as a 5-tuple): - The phone (whose HMM we are in) - The source HMM-state (as interpreted by the HmmTopology object, i.e. normally 0, 1 or 2) - - The \ref pdf_id "pdf-id" (i.e. the index of the pdf associated with the state) + - The \ref pdf_id "forward-pdf-id" + (i.e. the index of the forward transition pdfs associated with the state) + - The \ref pdf_id "self-loop-pdf-id" + (i.e. the index of the self-loop pdfs associated with the state) - The index of the transition in the HmmTopology object. The last of these four items could be viewed as encoding the destination @@ -198,7 +211,7 @@ prototype HMM (as given in the HmmTopology object). from (transition-state, transition-index) to transition-id, and vice versa. There are also in the transition-modeling code reference to the following concepts: - - A triple means a triple (phone, hmm-state, pdf) which is mappable to and from a transition-state. + - A tuple means a 4-tuple (phone, hmm-state, forward pdf, self-loop pdf) which is mappable to and from a transition-state. - A pair means a pair (transition-state, transition-index) which is mappable to and from a transition-id. \section hmm_transition_training Training the transition model diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc index 0034027cbe6..719e55dd6da 100644 --- a/src/feat/feature-plp.cc +++ b/src/feat/feature-plp.cc @@ -125,7 +125,7 @@ void PlpComputer::Compute(BaseFloat signal_log_energy, if (opts_.use_energy && !opts_.raw_energy) signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::min())); + std::numeric_limits::min())); if (srfft_ != NULL) // Compute FFT using split-radix algorithm. 
srfft_->Compute(signal_frame->Data(), true); diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc index 2726462d22c..65c0a2a29c3 100644 --- a/src/feat/feature-window.cc +++ b/src/feat/feature-window.cc @@ -140,7 +140,7 @@ void ProcessWindow(const FrameExtractionOptions &opts, if (log_energy_pre_window != NULL) { BaseFloat energy = std::max(VecVec(*window, *window), - std::numeric_limits::epsilon()); + std::numeric_limits::epsilon()); *log_energy_pre_window = Log(energy); } diff --git a/src/fstbin/fstdeterminizestar.cc b/src/fstbin/fstdeterminizestar.cc index ccd70764189..5e3de3e7ef9 100644 --- a/src/fstbin/fstdeterminizestar.cc +++ b/src/fstbin/fstdeterminizestar.cc @@ -24,10 +24,12 @@ #include "fstext/determinize-star.h" #include "fstext/fstext-utils.h" #include "fstext/kaldi-fst-io.h" -#ifndef _MSC_VER +#if !defined(_MSC_VER) && !defined(__APPLE__) #include // Comment this line and the call to signal below if // it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. +// when determinization does not terminate. We are disabling this code if +// compiling on Windows because signal.h is not available there, and on +// MacOS due to a problem with in the initial release of Sierra. #endif /* some test examples: @@ -91,7 +93,7 @@ int main(int argc, char *argv[]) { // This enables us to get traceback info from determinization that is // not seeming to terminate. -#ifndef _MSC_VER +#if !defined(_MSC_VER) && !defined(__APPLE__) signal(SIGUSR1, signal_handler); #endif if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { @@ -138,4 +140,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/fstbin/fstpropfinal.cc b/src/fstbin/fstpropfinal.cc index f469d7a93d1..d9a221f7805 100644 --- a/src/fstbin/fstpropfinal.cc +++ b/src/fstbin/fstpropfinal.cc @@ -1,6 +1,7 @@ // fstbin/fstpropfinal.cc // Copyright 2009-2011 Microsoft Corporation +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -25,13 +26,7 @@ #include "fstext/fstext-utils.h" #include "fstext/kaldi-fst-io.h" -#ifndef _MSC_VER -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. -#endif - -/* some test examples. +/* A test example. You have to have the right things on your PATH for this to work. cat < 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst > 2.fst - fstequivalent --random=true 1.fst 2.fst || echo "Test failed" - echo -n "." - done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo "2"; echo "1" ) | fstcompile | fstdeterminizestar - kill -SIGUSR1 [the process-id of fstdeterminizestar] - # prints out a bunch of debugging output showing the mess it got itself into. 
*/ -bool debug_location = false; -void signal_handler(int) { - debug_location = true; -} - - - int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -93,7 +62,7 @@ int main(int argc, char *argv[]) { std::string phi_str = po.GetOptArg(1), fst_in_str = po.GetOptArg(2), fst_out_str = po.GetOptArg(3); - + int32 phi_label; if (!ConvertStringToInteger(phi_str, &phi_label) @@ -104,9 +73,9 @@ int main(int argc, char *argv[]) { VectorFst *fst = ReadFstKaldi(fst_in_str); - + PropagateFinal(phi_label, fst); - + WriteFstKaldi(*fst, fst_out_str); delete fst; return 0; @@ -115,4 +84,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc index 4cfebcd0d51..ceca116c828 100644 --- a/src/hmm/hmm-test-utils.cc +++ b/src/hmm/hmm-test-utils.cc @@ -203,7 +203,7 @@ void GeneratePathThroughHmm(const HmmTopology &topology, const HmmTopology::HmmState &cur_hmm_state = this_entry[cur_state]; int32 num_transitions = cur_hmm_state.transitions.size(), transition_index = RandInt(0, num_transitions - 1); - if (cur_hmm_state.pdf_class != -1) { + if (cur_hmm_state.forward_pdf_class != -1) { std::pair pr(cur_state, transition_index); if (!reorder) { path->push_back(pr); @@ -257,12 +257,15 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep, trans_model.GetTopo().TopologyForPhone(phone); int32 hmm_state = path[k].first, transition_index = path[k].second, - pdf_class = entry[hmm_state].pdf_class, - pdf_id; - bool ans = ctx_dep.Compute(context_window, pdf_class, &pdf_id); + forward_pdf_class = entry[hmm_state].forward_pdf_class, + self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, + forward_pdf_id, self_loop_pdf_id; + bool ans = ctx_dep.Compute(context_window, forward_pdf_class, &forward_pdf_id); KALDI_ASSERT(ans && "context-dependency computation failed."); - int32 transition_state = trans_model.TripleToTransitionState( - phone, hmm_state, pdf_id), + ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id); + KALDI_ASSERT(ans && "context-dependency computation failed."); + int32 transition_state = trans_model.TupleToTransitionState( + phone, hmm_state, forward_pdf_id, self_loop_pdf_id), transition_id = trans_model.PairToTransitionId(transition_state, transition_index); alignment->push_back(transition_id); diff --git a/src/hmm/hmm-topology-test.cc b/src/hmm/hmm-topology-test.cc index 61cf13e17bc..14081d2355d 100644 --- a/src/hmm/hmm-topology-test.cc +++ b/src/hmm/hmm-topology-test.cc @@ -58,6 +58,17 @@ void TestHmmTopology() { " \n" " \n"; + std::string chain_input_str = "\n" + "\n" + " 1 2 3 4 5 6 7 8 9 \n" + " 0 0 1\n" + " 0 0.5\n" + " 1 0.5\n" + " \n" + " 1 \n" + "\n" + "\n"; + HmmTopology topo; if (RandInt(0, 1) == 0) { @@ -84,6 +95,13 @@ void TestHmmTopology() { KALDI_ASSERT(oss1.str() == oss2.str()); } + { // test chain topology + HmmTopology chain_topo; + std::istringstream chain_iss(chain_input_str); + chain_topo.Read(chain_iss, false); + KALDI_ASSERT(chain_topo.MinLength(3) == 1); + } + { // make sure GetDefaultTopology does not crash. 
std::vector phones; phones.push_back(1); diff --git a/src/hmm/hmm-topology.cc b/src/hmm/hmm-topology.cc index 54144326766..cf134065dbf 100644 --- a/src/hmm/hmm-topology.cc +++ b/src/hmm/hmm-topology.cc @@ -76,12 +76,24 @@ void HmmTopology::Read(std::istream &is, bool binary) { KALDI_ERR << "States are expected to be in order from zero, expected " << this_entry.size() << ", got " << state; ReadToken(is, binary, &token); - int32 pdf_class = kNoPdf; // -1 by default, means no pdf. + int32 forward_pdf_class = kNoPdf; // -1 by default, means no pdf. if (token == "") { - ReadBasicType(is, binary, &pdf_class); + ReadBasicType(is, binary, &forward_pdf_class); + this_entry.push_back(HmmState(forward_pdf_class)); ReadToken(is, binary, &token); - } - this_entry.push_back(HmmState(pdf_class)); + if (token == "") + KALDI_ERR << "pdf classes should be defined using " + << "or / pair"; + } else if (token == "") { + int32 self_loop_pdf_class = kNoPdf; + ReadBasicType(is, binary, &forward_pdf_class); + ReadToken(is, binary, &token); + KALDI_ASSERT(token == ""); + ReadBasicType(is, binary, &self_loop_pdf_class); + this_entry.push_back(HmmState(forward_pdf_class, self_loop_pdf_class)); + ReadToken(is, binary, &token); + } else + this_entry.push_back(HmmState(forward_pdf_class)); while (token == "") { int32 dst_state; BaseFloat trans_prob; @@ -118,13 +130,22 @@ void HmmTopology::Read(std::istream &is, bool binary) { ReadIntegerVector(is, binary, &phone2idx_); int32 sz; ReadBasicType(is, binary, &sz); + bool is_hmm = true; + if (sz == -1) { + is_hmm = false; + ReadBasicType(is, binary, &sz); + } entries_.resize(sz); for (int32 i = 0; i < sz; i++) { int32 thist_sz; ReadBasicType(is, binary, &thist_sz); entries_[i].resize(thist_sz); for (int32 j = 0 ; j < thist_sz; j++) { - ReadBasicType(is, binary, &(entries_[i][j].pdf_class)); + ReadBasicType(is, binary, &(entries_[i][j].forward_pdf_class)); + if (is_hmm) + entries_[i][j].self_loop_pdf_class = entries_[i][j].forward_pdf_class; + else + ReadBasicType(is, binary, &(entries_[i][j].self_loop_pdf_class)); int32 thiss_sz; ReadBasicType(is, binary, &thiss_sz); entries_[i][j].transitions.resize(thiss_sz); @@ -141,6 +162,7 @@ void HmmTopology::Read(std::istream &is, bool binary) { void HmmTopology::Write(std::ostream &os, bool binary) const { + bool is_hmm = IsHmm(); WriteToken(os, binary, ""); if (!binary) { // Text-mode write. 
os << "\n"; @@ -159,9 +181,17 @@ void HmmTopology::Write(std::ostream &os, bool binary) const { for (size_t j = 0; j < entries_[i].size(); j++) { WriteToken(os, binary, ""); WriteBasicType(os, binary, static_cast(j)); - if (entries_[i][j].pdf_class != kNoPdf) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, entries_[i][j].pdf_class); + if (entries_[i][j].forward_pdf_class != kNoPdf) { + if (is_hmm) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); + } else { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); + KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class); + } } for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) { WriteToken(os, binary, ""); @@ -177,11 +207,15 @@ void HmmTopology::Write(std::ostream &os, bool binary) const { } else { WriteIntegerVector(os, binary, phones_); WriteIntegerVector(os, binary, phone2idx_); + // -1 is put here as a signal that the object has the new, + // extended format with SelfLoopPdfClass + if (!is_hmm) WriteBasicType(os, binary, static_cast(-1)); WriteBasicType(os, binary, static_cast(entries_.size())); for (size_t i = 0; i < entries_.size(); i++) { WriteBasicType(os, binary, static_cast(entries_[i].size())); for (size_t j = 0; j < entries_[i].size(); j++) { - WriteBasicType(os, binary, entries_[i][j].pdf_class); + WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); + if (!is_hmm) WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class); WriteBasicType(os, binary, static_cast(entries_[i][j].transitions.size())); for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) { WriteBasicType(os, binary, entries_[i][j].transitions[k].first); @@ -215,7 +249,7 @@ void HmmTopology::Check() { if (!entries_[i][num_states-1].transitions.empty()) KALDI_ERR << "HmmTopology::Check(), last state must have no transitions."; // not sure how necessary this next stipulation is. - if (entries_[i][num_states-1].pdf_class != kNoPdf) + if (entries_[i][num_states-1].forward_pdf_class != kNoPdf) KALDI_ERR << "HmmTopology::Check(), last state must not be emitting."; std::vector has_trans_in(num_states, false); @@ -223,8 +257,10 @@ void HmmTopology::Check() { for (int32 j = 0; j < num_states; j++) { // j is the state-id. BaseFloat tot_prob = 0.0; - if (entries_[i][j].pdf_class != kNoPdf) - seen_pdf_classes.push_back(entries_[i][j].pdf_class); + if (entries_[i][j].forward_pdf_class != kNoPdf) { + seen_pdf_classes.push_back(entries_[i][j].forward_pdf_class); + seen_pdf_classes.push_back(entries_[i][j].self_loop_pdf_class); + } std::set seen_transition; for (int32 k = 0; static_cast(k) < entries_[i][j].transitions.size(); @@ -238,7 +274,7 @@ void HmmTopology::Check() { // that are being built, which enable the creation of phone-level lattices // and rescoring these with a different lexicon and LM. if (dst_state == num_states-1 // && j != 0 - && entries_[i][j].pdf_class == kNoPdf) + && entries_[i][j].forward_pdf_class == kNoPdf) KALDI_ERR << "We do not allow any state to be " "nonemitting and have a transition to the final-state (this would " "stop the SplitToPhones function from identifying the last state " @@ -248,7 +284,8 @@ void HmmTopology::Check() { if (seen_transition.count(dst_state) != 0) KALDI_ERR << "HmmTopology::Check(), duplicate transition found."; if (dst_state == k) { // self_loop... 
- KALDI_ASSERT(entries_[i][j].pdf_class != kNoPdf && "Nonemitting states cannot have self-loops."); + KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf && + "Nonemitting states cannot have self-loops."); } seen_transition.insert(dst_state); has_trans_in[dst_state] = true; @@ -275,6 +312,22 @@ void HmmTopology::Check() { } } +bool HmmTopology::IsHmm() const { + const std::vector &phones = GetPhones(); + KALDI_ASSERT(!phones.empty()); + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + const TopologyEntry &entry = TopologyForPhone(phone); + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... + int32 forward_pdf_class = entry[j].forward_pdf_class, + self_loop_pdf_class = entry[j].self_loop_pdf_class; + if (forward_pdf_class != self_loop_pdf_class) + return false; + } + } + return true; +} + const HmmTopology::TopologyEntry& HmmTopology::TopologyForPhone(int32 phone) const { // Will throw if phone not covered. if (static_cast(phone) >= phone2idx_.size() || phone2idx_[phone] == -1) { KALDI_ERR << "TopologyForPhone(), phone "<<(phone)<<" not covered."; @@ -286,8 +339,10 @@ int32 HmmTopology::NumPdfClasses(int32 phone) const { // will throw if phone not covered. const TopologyEntry &entry = TopologyForPhone(phone); int32 max_pdf_class = 0; - for (size_t i = 0; i < entry.size(); i++) - max_pdf_class = std::max(max_pdf_class, entry[i].pdf_class); + for (size_t i = 0; i < entry.size(); i++) { + max_pdf_class = std::max(max_pdf_class, entry[i].forward_pdf_class); + max_pdf_class = std::max(max_pdf_class, entry[i].self_loop_pdf_class); + } return max_pdf_class+1; } @@ -299,7 +354,7 @@ int32 HmmTopology::MinLength(int32 phone) const { std::numeric_limits::max()); KALDI_ASSERT(!entry.empty()); - min_length[0] = (entry[0].pdf_class == -1 ? 0 : 1); + min_length[0] = (entry[0].forward_pdf_class == -1 ? 0 : 1); int32 num_states = min_length.size(); bool changed = true; while (changed) { @@ -313,7 +368,7 @@ int32 HmmTopology::MinLength(int32 phone) const { int32 next_state = iter->first; KALDI_ASSERT(next_state < num_states); int32 next_state_min_length = min_length[s] + - (entry[next_state].pdf_class == -1 ? 0 : 1); + (entry[next_state].forward_pdf_class == -1 ? 0 : 1); if (next_state_min_length < min_length[next_state]) { min_length[next_state] = next_state_min_length; if (next_state < s) diff --git a/src/hmm/hmm-topology.h b/src/hmm/hmm-topology.h index 79b535e7d6b..edea02998c0 100644 --- a/src/hmm/hmm-topology.h +++ b/src/hmm/hmm-topology.h @@ -95,23 +95,38 @@ class HmmTopology { public: /// A structure defined inside HmmTopology to represent a HMM state. struct HmmState { - /// The \ref pdf_class pdf-class, typically 0, 1 or 2 (the same as the HMM-state index), + /// The \ref pdf_class forward-pdf-class, typically 0, 1 or 2 (the same as the HMM-state index), /// but may be different to enable us to hardwire sharing of state, and may be /// equal to \ref kNoPdf == -1 in order to specify nonemitting states (unusual). - int32 pdf_class; + int32 forward_pdf_class; + + /// The \ref pdf_class self-loop pdf-class, similar to \ref pdf_class forward-pdf-class. + /// They will either both be \ref kNoPdf, or neither be \ref kNoPdf. + int32 self_loop_pdf_class; /// A list of transitions, indexed by what we call a 'transition-index'. /// The first member of each pair is the index of the next HmmState, and the /// second is the default transition probability (before training). 
std::vector > transitions; - explicit HmmState(int32 p): pdf_class(p) { } + explicit HmmState(int32 pdf_class) { + this->forward_pdf_class = pdf_class; + this->self_loop_pdf_class = pdf_class; + } + explicit HmmState(int32 forward_pdf_class, int32 self_loop_pdf_class) { + KALDI_ASSERT((forward_pdf_class != kNoPdf && self_loop_pdf_class != kNoPdf) || + (forward_pdf_class == kNoPdf && self_loop_pdf_class == kNoPdf)); + this->forward_pdf_class = forward_pdf_class; + this->self_loop_pdf_class = self_loop_pdf_class; + } bool operator == (const HmmState &other) const { - return (pdf_class == other.pdf_class && transitions == other.transitions); + return (forward_pdf_class == other.forward_pdf_class && + self_loop_pdf_class == other.self_loop_pdf_class && + transitions == other.transitions); } - HmmState(): pdf_class(-1) { } + HmmState(): forward_pdf_class(-1), self_loop_pdf_class(-1) { } }; /// TopologyEntry is a typedef that represents the topology of @@ -124,6 +139,15 @@ class HmmTopology { // Checks that the object is valid, and throw exception otherwise. void Check(); + /// Returns true if this HmmTopology is really 'hmm-like', i.e. the pdf-class on + /// the self-loops and forward transitions of all states are identical. [note: in HMMs, + /// the densities are associated with the states.] We have extended this to + /// support 'non-hmm-like' topologies (where those pdf-classes are different), + /// in order to make for more compact decoding graphs in our so-called 'chain models' + /// (AKA lattice-free MMI), where we use 1-state topologies that have different pdf-classes + /// for the self-loop and the forward transition. Note that we always use the 'reorder=true' + /// option so the 'forward transition' actually comes before the self-loop. + bool IsHmm() const; /// Returns the topology entry (i.e. vector of HmmState) for this phone; /// will throw exception if phone not covered by the topology. diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index 04ec09d14b7..ab0b133f708 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -93,11 +93,16 @@ fst::VectorFst *GetHmmAsFst( for (int32 hmm_state = 0; hmm_state < static_cast(entry.size()); hmm_state++) { - int32 pdf_class = entry[hmm_state].pdf_class, pdf; - if (pdf_class == kNoPdf) pdf = kNoPdf; // nonemitting state. - else { - KALDI_ASSERT(pdf_class < static_cast(pdfs.size())); - pdf = pdfs[pdf_class]; + int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf; + int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf; + if (forward_pdf_class == kNoPdf) { // nonemitting state. + forward_pdf = kNoPdf; + self_loop_pdf = kNoPdf; + } else { + KALDI_ASSERT(forward_pdf_class < static_cast(pdfs.size())); + KALDI_ASSERT(self_loop_pdf_class < static_cast(pdfs.size())); + forward_pdf = pdfs[forward_pdf_class]; + self_loop_pdf = pdfs[self_loop_pdf_class]; } int32 trans_idx; for (trans_idx = 0; @@ -110,7 +115,7 @@ fst::VectorFst *GetHmmAsFst( if (is_self_loop) continue; // We will add self-loops in at a later stage of processing, // not in this function. - if (pdf_class == kNoPdf) { + if (forward_pdf_class == kNoPdf) { // no pdf, hence non-estimated probability. // [would not happen with normal topology] . There is no transition-state // involved in this case. @@ -118,7 +123,7 @@ fst::VectorFst *GetHmmAsFst( label = 0; } else { // normal probability. 
int32 trans_state = - trans_model.TripleToTransitionState(phone, hmm_state, pdf); + trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf); int32 trans_id = trans_model.PairToTransitionId(trans_state, trans_idx); log_prob = trans_model.GetTransitionLogProbIgnoringSelfLoops(trans_id); @@ -183,10 +188,15 @@ GetHmmAsFstSimple(std::vector phone_window, for (int32 hmm_state = 0; hmm_state < static_cast(entry.size()); hmm_state++) { - int32 pdf_class = entry[hmm_state].pdf_class, pdf; - if (pdf_class == kNoPdf) pdf = kNoPdf; // nonemitting state; not generally used. - else { - bool ans = ctx_dep.Compute(phone_window, pdf_class, &pdf); + int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf; + int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf; + if (forward_pdf_class == kNoPdf) { // nonemitting state; not generally used. + forward_pdf = kNoPdf; + self_loop_pdf = kNoPdf; + } else { + bool ans = ctx_dep.Compute(phone_window, forward_pdf_class, &forward_pdf); + KALDI_ASSERT(ans && "Context-dependency computation failed."); + ans = ctx_dep.Compute(phone_window, self_loop_pdf_class, &self_loop_pdf); KALDI_ASSERT(ans && "Context-dependency computation failed."); } int32 trans_idx; @@ -196,7 +206,7 @@ GetHmmAsFstSimple(std::vector phone_window, BaseFloat log_prob; Label label; int32 dest_state = entry[hmm_state].transitions[trans_idx].first; - if (pdf_class == kNoPdf) { + if (forward_pdf_class == kNoPdf) { // no pdf, hence non-estimated probability. very unusual case. [would // not happen with normal topology] . There is no transition-state // involved in this case. @@ -205,7 +215,7 @@ GetHmmAsFstSimple(std::vector phone_window, label = 0; } else { // normal probability. int32 trans_state = - trans_model.TripleToTransitionState(phone, hmm_state, pdf); + trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf); int32 trans_id = trans_model.PairToTransitionId(trans_state, trans_idx); log_prob = prob_scale * trans_model.GetTransitionLogProb(trans_id); @@ -652,8 +662,8 @@ static bool SplitToPhonesInternal(const TransitionModel &trans_model, int32 trans_state = trans_model.TransitionIdToTransitionState(alignment[cur_point]); int32 phone = trans_model.TransitionStateToPhone(trans_state); - int32 pdf_class = trans_model.GetTopo().TopologyForPhone(phone)[0].pdf_class; - if (pdf_class != kNoPdf) // initial-state of the current phone is emitting + int32 forward_pdf_class = trans_model.GetTopo().TopologyForPhone(phone)[0].forward_pdf_class; + if (forward_pdf_class != kNoPdf) // initial-state of the current phone is emitting if (trans_model.TransitionStateToHmmState(trans_state) != 0) was_ok = false; for (size_t j = cur_point; j < end_points[i]; j++) @@ -739,14 +749,19 @@ static inline void ConvertAlignmentForPhone( // the topologies and lengths match -> we can directly transfer // the alignment. 
for (int32 j = 0; j < alignment_size; j++) { - int32 old_tid = old_phone_alignment[j]; - int32 pdf_class = old_trans_model.TransitionIdToPdfClass(old_tid); + int32 old_tid = old_phone_alignment[j], + old_tstate = old_trans_model.TransitionIdToTransitionState(old_tid); + int32 forward_pdf_class = + old_trans_model.TransitionStateToForwardPdfClass(old_tstate), + self_loop_pdf_class = + old_trans_model.TransitionStateToSelfLoopPdfClass(old_tstate); int32 hmm_state = old_trans_model.TransitionIdToHmmState(old_tid); int32 trans_idx = old_trans_model.TransitionIdToTransitionIndex(old_tid); - int32 new_pdf = pdf_ids[pdf_class]; + int32 new_forward_pdf = pdf_ids[forward_pdf_class]; + int32 new_self_loop_pdf = pdf_ids[self_loop_pdf_class]; int32 new_trans_state = - new_trans_model.TripleToTransitionState(new_central_phone, hmm_state, - new_pdf); + new_trans_model.TupleToTransitionState(new_central_phone, hmm_state, + new_forward_pdf, new_self_loop_pdf); int32 new_tid = new_trans_model.PairToTransitionId(new_trans_state, trans_idx); (*new_phone_alignment)[j] = new_tid; diff --git a/src/hmm/transition-model.cc b/src/hmm/transition-model.cc index df22169cd25..83edbaf5805 100644 --- a/src/hmm/transition-model.cc +++ b/src/hmm/transition-model.cc @@ -24,13 +24,26 @@ namespace kaldi { -void TransitionModel::ComputeTriples(const ContextDependencyInterface &ctx_dep) { +void TransitionModel::ComputeTuples(const ContextDependencyInterface &ctx_dep) { + if (IsHmm()) + ComputeTuplesIsHmm(ctx_dep); + else + ComputeTuplesNotHmm(ctx_dep); + + // now tuples_ is populated with all possible tuples of (phone, hmm_state, pdf, self_loop_pdf). + std::sort(tuples_.begin(), tuples_.end()); // sort to enable reverse lookup. + // this sorting defines the transition-ids. +} + +void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep) { const std::vector &phones = topo_.GetPhones(); - std::vector > > pdf_info; KALDI_ASSERT(!phones.empty()); + + // this is the case for normal models, but not for chain models + std::vector > > pdf_info; std::vector num_pdf_classes( 1 + *std::max_element(phones.begin(), phones.end()), -1); for (size_t i = 0; i < phones.size(); i++) - num_pdf_classes[phones[i]] = topo_.NumPdfClasses(phones[i]); + num_pdf_classes[phones[i]] = topo_.NumPdfClasses(phones[i]); ctx_dep.GetPdfInfo(phones, num_pdf_classes, &pdf_info); // pdf_info is list indexed by pdf of which (phone, pdf_class) it // can correspond to. @@ -43,47 +56,108 @@ void TransitionModel::ComputeTriples(const ContextDependencyInterface &ctx_dep) int32 phone = phones[i]; const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... - int32 pdf_class = entry[j].pdf_class; + int32 pdf_class = entry[j].forward_pdf_class; if (pdf_class != kNoPdf) { to_hmm_state_list[std::make_pair(phone, pdf_class)].push_back(j); } } } + for (int32 pdf = 0; pdf < static_cast(pdf_info.size()); pdf++) { for (size_t j = 0; j < pdf_info[pdf].size(); j++) { int32 phone = pdf_info[pdf][j].first, - pdf_class = pdf_info[pdf][j].second; + pdf_class = pdf_info[pdf][j].second; const std::vector &state_vec = to_hmm_state_list[std::make_pair(phone, pdf_class)]; KALDI_ASSERT(!state_vec.empty()); // state_vec is a list of the possible HMM-states that emit this // pdf_class.
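As an aside (editor's sketch, not part of the patch): the reverse lookup that TupleToTransitionState() performs later in this file relies on tuples_ being sorted, so that std::lower_bound can map a (phone, hmm-state, forward-pdf, self-loop-pdf) tuple back to its 1-based transition-state index. A minimal standalone version of that idiom, with hypothetical names:

#include <algorithm>
#include <cassert>
#include <vector>

struct Tuple {
  int phone, hmm_state, forward_pdf, self_loop_pdf;
  bool operator<(const Tuple &o) const {
    if (phone != o.phone) return phone < o.phone;
    if (hmm_state != o.hmm_state) return hmm_state < o.hmm_state;
    if (forward_pdf != o.forward_pdf) return forward_pdf < o.forward_pdf;
    return self_loop_pdf < o.self_loop_pdf;
  }
  bool operator==(const Tuple &o) const {
    return phone == o.phone && hmm_state == o.hmm_state &&
           forward_pdf == o.forward_pdf && self_loop_pdf == o.self_loop_pdf;
  }
};

// Returns the 1-based index ("transition-state") of 'target' in the sorted
// vector 'tuples', or 0 if it is not present.
int TupleToIndex(const std::vector<Tuple> &tuples, const Tuple &target) {
  std::vector<Tuple>::const_iterator it =
      std::lower_bound(tuples.begin(), tuples.end(), target);
  if (it == tuples.end() || !(*it == target)) return 0;
  return static_cast<int>(it - tuples.begin()) + 1;
}

int main() {
  std::vector<Tuple> tuples = { {1, 0, 3, 4}, {1, 0, 5, 5}, {2, 0, 7, 8} };
  std::sort(tuples.begin(), tuples.end());  // the sorting defines the indexing.
  Tuple query = {1, 0, 5, 5};
  assert(TupleToIndex(tuples, query) == 2);
  return 0;
}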
for (size_t k = 0; k < state_vec.size(); k++) { int32 hmm_state = state_vec[k]; - triples_.push_back(Triple(phone, hmm_state, pdf)); + tuples_.push_back(Tuple(phone, hmm_state, pdf, pdf)); } } } +} - // now triples_ is populated with all possible triples of (phone, hmm_state, pdf). - std::sort(triples_.begin(), triples_.end()); // sort to enable reverse lookup. - // this sorting defines the transition-ids. +void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep) { + const std::vector &phones = topo_.GetPhones(); + KALDI_ASSERT(!phones.empty()); + + // pdf_info is a set of lists indexed by phone. Each list is indexed by + // (pdf-class, self-loop pdf-class) of each state of that phone, and the element + // is a list of possible (pdf, self-loop pdf) pairs that that (pdf-class, self-loop pdf-class) + // pair generates. + std::vector > > > pdf_info; + // pdf_class_pairs is a set of lists indexed by phone. Each list stores + // (pdf-class, self-loop pdf-class) of each state of that phone. + std::vector > > pdf_class_pairs; + pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end())); + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... + int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class; + if (forward_pdf_class != kNoPdf) + pdf_class_pairs[phone].push_back(std::make_pair(forward_pdf_class, self_loop_pdf_class)); + } + } + ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info); + + std::vector, std::vector > > to_hmm_state_list; + to_hmm_state_list.resize(1 + *std::max_element(phones.begin(), phones.end())); + // to_hmm_state_list is a phone-indexed set of maps from (pdf-class, self-loop pdf_class) to the list + // of hmm-states in the HMM for that phone that that (pdf-class, self-loop pdf-class) + // can correspond to. + for (size_t i = 0; i < phones.size(); i++) { // setting up to_hmm_state_list. + int32 phone = phones[i]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + std::map, std::vector > phone_to_hmm_state_list; + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... 
+ int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class; + if (forward_pdf_class != kNoPdf) { + phone_to_hmm_state_list[std::make_pair(forward_pdf_class, self_loop_pdf_class)].push_back(j); + } + } + to_hmm_state_list[phone] = phone_to_hmm_state_list; + } + + for (int32 i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + for (int32 j = 0; j < static_cast(pdf_info[phone].size()); j++) { + int32 pdf_class = pdf_class_pairs[phone][j].first, + self_loop_pdf_class = pdf_class_pairs[phone][j].second; + const std::vector &state_vec = + to_hmm_state_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)]; + KALDI_ASSERT(!state_vec.empty()); + for (size_t k = 0; k < state_vec.size(); k++) { + int32 hmm_state = state_vec[k]; + for (size_t m = 0; m < pdf_info[phone][j].size(); m++) { + int32 pdf = pdf_info[phone][j][m].first, + self_loop_pdf = pdf_info[phone][j][m].second; + tuples_.push_back(Tuple(phone, hmm_state, pdf, self_loop_pdf)); + } + } + } + } } void TransitionModel::ComputeDerived() { - state2id_.resize(triples_.size()+2); // indexed by transition-state, which + state2id_.resize(tuples_.size()+2); // indexed by transition-state, which // is one based, but also an entry for one past end of list. int32 cur_transition_id = 1; num_pdfs_ = 0; for (int32 tstate = 1; - tstate <= static_cast(triples_.size()+1); // not a typo. + tstate <= static_cast(tuples_.size()+1); // not a typo. tstate++) { state2id_[tstate] = cur_transition_id; - if (static_cast(tstate) <= triples_.size()) { - int32 phone = triples_[tstate-1].phone, - hmm_state = triples_[tstate-1].hmm_state, - pdf = triples_[tstate-1].pdf; - num_pdfs_ = std::max(num_pdfs_, 1+pdf); + if (static_cast(tstate) <= tuples_.size()) { + int32 phone = tuples_[tstate-1].phone, + hmm_state = tuples_[tstate-1].hmm_state, + forward_pdf = tuples_[tstate-1].forward_pdf, + self_loop_pdf = tuples_[tstate-1].self_loop_pdf; + num_pdfs_ = std::max(num_pdfs_, 1 + forward_pdf); + num_pdfs_ = std::max(num_pdfs_, 1 + self_loop_pdf); const HmmTopology::HmmState &state = topo_.TopologyForPhone(phone)[hmm_state]; int32 my_num_ids = static_cast(state.transitions.size()); cur_transition_id += my_num_ids; // # trans out of this state. @@ -91,20 +165,26 @@ void TransitionModel::ComputeDerived() { } id2state_.resize(cur_transition_id); // cur_transition_id is #transition-ids+1. - for (int32 tstate = 1; tstate <= static_cast(triples_.size()); tstate++) - for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) + id2pdf_id_.resize(cur_transition_id); + for (int32 tstate = 1; tstate <= static_cast(tuples_.size()); tstate++) + for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) { id2state_[tid] = tstate; - + if (IsSelfLoop(tid)) + id2pdf_id_[tid] = tuples_[tstate-1].self_loop_pdf; + else + id2pdf_id_[tid] = tuples_[tstate-1].forward_pdf; + } } + void TransitionModel::InitializeProbs() { log_probs_.Resize(NumTransitionIds()+1); // one-based array, zeroth element empty. 
for (int32 trans_id = 1; trans_id <= NumTransitionIds(); trans_id++) { int32 trans_state = id2state_[trans_id]; int32 trans_index = trans_id - state2id_[trans_state]; - const Triple &triple = triples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(triple.phone); - KALDI_ASSERT(static_cast(triple.hmm_state) < entry.size()); - BaseFloat prob = entry[triple.hmm_state].transitions[trans_index].second; + const Tuple &tuple = tuples_[trans_state-1]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); + KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); + BaseFloat prob = entry[tuple.hmm_state].transitions[trans_index].second; if (prob <= 0.0) KALDI_ERR << "TransitionModel::InitializeProbs, zero " "probability [should remove that entry in the topology]"; @@ -129,40 +209,55 @@ void TransitionModel::Check() const { KALDI_ASSERT(tid == PairToTransitionId(tstate, index)); int32 phone = TransitionStateToPhone(tstate), hmm_state = TransitionStateToHmmState(tstate), - pdf = TransitionStateToPdf(tstate); - KALDI_ASSERT(tstate == TripleToTransitionState(phone, hmm_state, pdf)); + forward_pdf = TransitionStateToForwardPdf(tstate), + self_loop_pdf = TransitionStateToSelfLoopPdf(tstate); + KALDI_ASSERT(tstate == TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf)); KALDI_ASSERT(log_probs_(tid) <= 0.0 && log_probs_(tid) - log_probs_(tid) == 0.0); // checking finite and non-positive (and not out-of-bounds). } } +bool TransitionModel::IsHmm() const { + const std::vector &phones = topo_.GetPhones(); + KALDI_ASSERT(!phones.empty()); + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... + if (entry[j].forward_pdf_class != entry[j].self_loop_pdf_class) + return false; + } + } + return true; +} + TransitionModel::TransitionModel(const ContextDependencyInterface &ctx_dep, const HmmTopology &hmm_topo): topo_(hmm_topo) { - // First thing is to get all possible triples. - ComputeTriples(ctx_dep); + // First thing is to get all possible tuples. + ComputeTuples(ctx_dep); ComputeDerived(); InitializeProbs(); Check(); } -int32 TransitionModel::TripleToTransitionState(int32 phone, int32 hmm_state, int32 pdf) const { - Triple triple(phone, hmm_state, pdf); +int32 TransitionModel::TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const { + Tuple tuple(phone, hmm_state, pdf, self_loop_pdf); // Note: if this ever gets too expensive, which is unlikely, we can refactor // this code to sort first on pdf, and then index on pdf, so those // that have the same pdf are in a contiguous range. - std::vector::const_iterator iter = - std::lower_bound(triples_.begin(), triples_.end(), triple); - if (iter == triples_.end() || !(*iter == triple)) { - KALDI_ERR << "TransitionModel::TripleToTransitionState, triple not found." + std::vector::const_iterator iter = + std::lower_bound(tuples_.begin(), tuples_.end(), tuple); + if (iter == tuples_.end() || !(*iter == tuple)) { + KALDI_ERR << "TransitionModel::TupleToTransitionState, tuple not found." << " (incompatible tree and model?)"; } - // triples_ is indexed by transition_state-1, so add one. - return static_cast((iter - triples_.begin())) + 1; + // tuples_ is indexed by transition_state-1, so add one. 
+ return static_cast((iter - tuples_.begin())) + 1; } int32 TransitionModel::NumTransitionIndices(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); return static_cast(state2id_[trans_state+1]-state2id_[trans_state]); } @@ -177,32 +272,57 @@ int32 TransitionModel::TransitionIdToTransitionIndex(int32 trans_id) const { } int32 TransitionModel::TransitionStateToPhone(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); - return triples_[trans_state-1].phone; + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].phone; } -int32 TransitionModel::TransitionStateToPdf(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); - return triples_[trans_state-1].pdf; +int32 TransitionModel::TransitionStateToForwardPdf(int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].forward_pdf; +} + +int32 TransitionModel::TransitionStateToForwardPdfClass( + int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + const Tuple &t = tuples_[trans_state-1]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); + KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); + return entry[t.hmm_state].forward_pdf_class; +} + + +int32 TransitionModel::TransitionStateToSelfLoopPdfClass( + int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + const Tuple &t = tuples_[trans_state-1]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); + KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); + return entry[t.hmm_state].self_loop_pdf_class; +} + + +int32 TransitionModel::TransitionStateToSelfLoopPdf(int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].self_loop_pdf; } int32 TransitionModel::TransitionStateToHmmState(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); - return triples_[trans_state-1].hmm_state; + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].hmm_state; } int32 TransitionModel::PairToTransitionId(int32 trans_state, int32 trans_index) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); KALDI_ASSERT(trans_index < state2id_[trans_state+1] - state2id_[trans_state]); return state2id_[trans_state] + trans_index; } int32 TransitionModel::NumPhones() const { - int32 num_trans_state = triples_.size(); + int32 num_trans_state = tuples_.size(); int32 max_phone_id = 0; for (int32 i = 0; i < num_trans_state; ++i) { - if (triples_[i].phone > max_phone_id) - max_phone_id = triples_[i].phone; + if (tuples_[i].phone > max_phone_id) + max_phone_id = tuples_[i].phone; } return max_phone_id; } @@ -212,36 +332,25 @@ bool TransitionModel::IsFinal(int32 trans_id) const { KALDI_ASSERT(static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; int32 trans_index = trans_id - state2id_[trans_state]; - const Triple &triple = triples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(triple.phone); - KALDI_ASSERT(static_cast(triple.hmm_state) < entry.size()); - KALDI_ASSERT(static_cast(triple.hmm_state) < entry.size()); + const Tuple &tuple = tuples_[trans_state-1]; + const 
HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); + KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); + KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); KALDI_ASSERT(static_cast(trans_index) < - entry[triple.hmm_state].transitions.size()); + entry[tuple.hmm_state].transitions.size()); // return true if the transition goes to the final state of the // topology entry. - return (entry[triple.hmm_state].transitions[trans_index].first + 1 == + return (entry[tuple.hmm_state].transitions[trans_index].first + 1 == static_cast(entry.size())); } -bool TransitionModel::IsSelfLoop(int32 trans_id) const { - KALDI_ASSERT(static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - int32 trans_index = trans_id - state2id_[trans_state]; - const Triple &triple = triples_[trans_state-1]; - int32 phone = triple.phone, hmm_state = triple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - KALDI_ASSERT(static_cast(hmm_state) < entry.size()); - return (static_cast(trans_index) < entry[hmm_state].transitions.size() - && entry[hmm_state].transitions[trans_index].first == hmm_state); -} int32 TransitionModel::SelfLoopOf(int32 trans_state) const { // returns the self-loop transition-id, - KALDI_ASSERT(static_cast(trans_state-1) < triples_.size()); - const Triple &triple = triples_[trans_state-1]; + KALDI_ASSERT(static_cast(trans_state-1) < tuples_.size()); + const Tuple &tuple = tuples_[trans_state-1]; // or zero if does not exist. - int32 phone = triple.phone, hmm_state = triple.hmm_state; + int32 phone = tuple.phone, hmm_state = tuple.hmm_state; const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); KALDI_ASSERT(static_cast(hmm_state) < entry.size()); for (int32 trans_index = 0; @@ -274,16 +383,22 @@ void TransitionModel::ComputeDerivedOfProbs() { void TransitionModel::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); topo_.Read(is, binary); - ExpectToken(is, binary, ""); + std::string token; + ReadToken(is, binary, &token); int32 size; ReadBasicType(is, binary, &size); - triples_.resize(size); + tuples_.resize(size); for (int32 i = 0; i < size; i++) { - ReadBasicType(is, binary, &(triples_[i].phone)); - ReadBasicType(is, binary, &(triples_[i].hmm_state)); - ReadBasicType(is, binary, &(triples_[i].pdf)); + ReadBasicType(is, binary, &(tuples_[i].phone)); + ReadBasicType(is, binary, &(tuples_[i].hmm_state)); + ReadBasicType(is, binary, &(tuples_[i].forward_pdf)); + if (token == "") + ReadBasicType(is, binary, &(tuples_[i].self_loop_pdf)); + else if (token == "") + tuples_[i].self_loop_pdf = tuples_[i].forward_pdf; } - ExpectToken(is, binary, ""); + ReadToken(is, binary, &token); + KALDI_ASSERT(token == "" || token == ""); ComputeDerived(); ExpectToken(is, binary, ""); log_probs_.Read(is, binary); @@ -294,19 +409,28 @@ void TransitionModel::Read(std::istream &is, bool binary) { } void TransitionModel::Write(std::ostream &os, bool binary) const { + bool is_hmm = IsHmm(); WriteToken(os, binary, ""); if (!binary) os << "\n"; topo_.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(triples_.size())); + if (is_hmm) + WriteToken(os, binary, ""); + else + WriteToken(os, binary, ""); + WriteBasicType(os, binary, static_cast(tuples_.size())); if (!binary) os << "\n"; - for (int32 i = 0; i < static_cast (triples_.size()); i++) { - WriteBasicType(os, binary, triples_[i].phone); - WriteBasicType(os, binary, triples_[i].hmm_state); - WriteBasicType(os, 
binary, triples_[i].pdf); + for (int32 i = 0; i < static_cast (tuples_.size()); i++) { + WriteBasicType(os, binary, tuples_[i].phone); + WriteBasicType(os, binary, tuples_[i].hmm_state); + WriteBasicType(os, binary, tuples_[i].forward_pdf); + if (!is_hmm) + WriteBasicType(os, binary, tuples_[i].self_loop_pdf); if (!binary) os << "\n"; } - WriteToken(os, binary, ""); + if (is_hmm) + WriteToken(os, binary, ""); + else + WriteToken(os, binary, ""); if (!binary) os << "\n"; WriteToken(os, binary, ""); if (!binary) os << "\n"; @@ -473,8 +597,12 @@ void TransitionModel::MleUpdateShared(const Vector &stats, std::map > pdf_to_tstate; for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 pdf = TransitionStateToPdf(tstate); + int32 pdf = TransitionStateToForwardPdf(tstate); pdf_to_tstate[pdf].insert(tstate); + if (!IsHmm()) { + pdf = TransitionStateToSelfLoopPdf(tstate); + pdf_to_tstate[pdf].insert(tstate); + } } std::map >::iterator map_iter; for (map_iter = pdf_to_tstate.begin(); @@ -567,8 +695,12 @@ void TransitionModel::MapUpdateShared(const Vector &stats, std::map > pdf_to_tstate; for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 pdf = TransitionStateToPdf(tstate); + int32 pdf = TransitionStateToForwardPdf(tstate); pdf_to_tstate[pdf].insert(tstate); + if (!IsHmm()) { + pdf = TransitionStateToSelfLoopPdf(tstate); + pdf_to_tstate[pdf].insert(tstate); + } } std::map >::iterator map_iter; for (map_iter = pdf_to_tstate.begin(); @@ -642,24 +774,27 @@ void TransitionModel::MapUpdateShared(const Vector &stats, int32 TransitionModel::TransitionIdToPhone(int32 trans_id) const { KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; - return triples_[trans_state-1].phone; + return tuples_[trans_state-1].phone; } int32 TransitionModel::TransitionIdToPdfClass(int32 trans_id) const { KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; - const Triple &t = triples_[trans_state-1]; + const Tuple &t = tuples_[trans_state-1]; const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); - return entry[t.hmm_state].pdf_class; + if (IsSelfLoop(trans_id)) + return entry[t.hmm_state].self_loop_pdf_class; + else + return entry[t.hmm_state].forward_pdf_class; } int32 TransitionModel::TransitionIdToHmmState(int32 trans_id) const { KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; - const Triple &t = triples_[trans_state-1]; + const Tuple &t = tuples_[trans_state-1]; return t.hmm_state; } @@ -668,23 +803,34 @@ void TransitionModel::Print(std::ostream &os, const Vector *occs) { if (occs != NULL) KALDI_ASSERT(occs->Dim() == NumPdfs()); + bool is_hmm = IsHmm(); for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - const Triple &triple = triples_[tstate-1]; - KALDI_ASSERT(static_cast(triple.phone) < phone_names.size()); - std::string phone_name = phone_names[triple.phone]; + const Tuple &tuple = tuples_[tstate-1]; + KALDI_ASSERT(static_cast(tuple.phone) < phone_names.size()); + std::string phone_name = phone_names[tuple.phone]; os << "Transition-state " << tstate << ": phone = " << phone_name - << " hmm-state = " << triple.hmm_state << " pdf = " << triple.pdf << '\n'; + << " hmm-state = " << tuple.hmm_state; + if (is_hmm) + os << " pdf = " << tuple.forward_pdf << '\n'; + else + os << " forward-pdf = " << 
tuple.forward_pdf << " self-loop-pdf = " + << tuple.self_loop_pdf << '\n'; for (int32 tidx = 0; tidx < NumTransitionIndices(tstate); tidx++) { int32 tid = PairToTransitionId(tstate, tidx); BaseFloat p = GetTransitionProb(tid); os << " Transition-id = " << tid << " p = " << p; - if (occs != NULL) os << " count of pdf = " << (*occs)(triple.pdf); + if (occs != NULL) { + if (IsSelfLoop(tid)) + os << " count of pdf = " << (*occs)(tuple.self_loop_pdf); + else + os << " count of pdf = " << (*occs)(tuple.forward_pdf); + } // now describe what it's a transition to. if (IsSelfLoop(tid)) os << " [self-loop]\n"; else { - int32 hmm_state = triple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(triple.phone); + int32 hmm_state = tuple.hmm_state; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); KALDI_ASSERT(static_cast(hmm_state) < entry.size()); int32 next_hmm_state = entry[hmm_state].transitions[tidx].first; KALDI_ASSERT(next_hmm_state != hmm_state); @@ -702,14 +848,18 @@ bool GetPdfsForPhones(const TransitionModel &trans_model, pdfs->clear(); for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) { if (std::binary_search(phones.begin(), phones.end(), - trans_model.TransitionStateToPhone(tstate))) - pdfs->push_back(trans_model.TransitionStateToPdf(tstate)); + trans_model.TransitionStateToPhone(tstate))) { + pdfs->push_back(trans_model.TransitionStateToForwardPdf(tstate)); + pdfs->push_back(trans_model.TransitionStateToSelfLoopPdf(tstate)); + } } SortAndUniq(pdfs); for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) - if (std::binary_search(pdfs->begin(), pdfs->end(), - trans_model.TransitionStateToPdf(tstate)) + if ((std::binary_search(pdfs->begin(), pdfs->end(), + trans_model.TransitionStateToForwardPdf(tstate)) || + std::binary_search(pdfs->begin(), pdfs->end(), + trans_model.TransitionStateToSelfLoopPdf(tstate))) && !std::binary_search(phones.begin(), phones.end(), trans_model.TransitionStateToPhone(tstate))) return false; @@ -724,7 +874,9 @@ bool GetPhonesForPdfs(const TransitionModel &trans_model, phones->clear(); for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) { if (std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToPdf(tstate))) + trans_model.TransitionStateToForwardPdf(tstate)) || + std::binary_search(pdfs.begin(), pdfs.end(), + trans_model.TransitionStateToSelfLoopPdf(tstate))) phones->push_back(trans_model.TransitionStateToPhone(tstate)); } SortAndUniq(phones); @@ -732,16 +884,30 @@ bool GetPhonesForPdfs(const TransitionModel &trans_model, for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) if (std::binary_search(phones->begin(), phones->end(), trans_model.TransitionStateToPhone(tstate)) - && !std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToPdf(tstate))) + && !(std::binary_search(pdfs.begin(), pdfs.end(), + trans_model.TransitionStateToForwardPdf(tstate)) && + std::binary_search(pdfs.begin(), pdfs.end(), + trans_model.TransitionStateToSelfLoopPdf(tstate))) ) return false; return true; } bool TransitionModel::Compatible(const TransitionModel &other) const { - return (topo_ == other.topo_ && triples_ == other.triples_ && + return (topo_ == other.topo_ && tuples_ == other.tuples_ && state2id_ == other.state2id_ && id2state_ == other.id2state_ && num_pdfs_ == other.num_pdfs_); } +bool TransitionModel::IsSelfLoop(int32 trans_id) const { + KALDI_ASSERT(static_cast(trans_id) < 
id2state_.size()); + int32 trans_state = id2state_[trans_id]; + int32 trans_index = trans_id - state2id_[trans_state]; + const Tuple &tuple = tuples_[trans_state-1]; + int32 phone = tuple.phone, hmm_state = tuple.hmm_state; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + KALDI_ASSERT(static_cast(hmm_state) < entry.size()); + return (static_cast(trans_index) < entry[hmm_state].transitions.size() + && entry[hmm_state].transitions[trans_index].first == hmm_state); +} + } // End namespace kaldi diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index ff236e6de9e..33a0d55443e 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -53,7 +53,7 @@ namespace kaldi { // phone: a phone index (1, 2, 3 ...) // HMM-state: a number (0, 1, 2...) that indexes TopologyEntry (see hmm-topology.h) // pdf-id: a number output by the Compute function of ContextDependency (it -// indexes pdf's). Zero-based. +// indexes pdf's, either forward or self-loop). Zero-based. // transition-state: the states for which we estimate transition probabilities for transitions // out of them. In some topologies, will map one-to-one with pdf-ids. // One-based, since it appears on FSTs. @@ -66,14 +66,15 @@ namespace kaldi { // One-based, since it appears on FSTs. // // List of the possible mappings TransitionModel can do: -// (phone, HMM-state, pdf-id) -> transition-state -// (transition-state, transition-index) -> transition-id +// (phone, HMM-state, forward-pdf-id, self-loop-pdf-id) -> transition-state +// (transition-state, transition-index) -> transition-id // Reverse mappings: // transition-id -> transition-state // transition-id -> transition-index // transition-state -> phone // transition-state -> HMM-state -// transition-state -> pdf-id +// transition-state -> forward-pdf-id +// transition-state -> self-loop-pdf-id // // The main things the TransitionModel object can do are: // Get initialized (need ContextDependency and HmmTopology objects). @@ -141,13 +142,16 @@ class TransitionModel { /// \name Integer mapping functions /// @{ - int32 TripleToTransitionState(int32 phone, int32 hmm_state, int32 pdf) const; + int32 TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const; int32 PairToTransitionId(int32 trans_state, int32 trans_index) const; int32 TransitionIdToTransitionState(int32 trans_id) const; int32 TransitionIdToTransitionIndex(int32 trans_id) const; int32 TransitionStateToPhone(int32 trans_state) const; int32 TransitionStateToHmmState(int32 trans_state) const; - int32 TransitionStateToPdf(int32 trans_state) const; + int32 TransitionStateToForwardPdfClass(int32 trans_state) const; + int32 TransitionStateToSelfLoopPdfClass(int32 trans_state) const; + int32 TransitionStateToForwardPdf(int32 trans_state) const; + int32 TransitionStateToSelfLoopPdf(int32 trans_state) const; int32 SelfLoopOf(int32 trans_state) const; // returns the self-loop transition-id, or zero if // this state doesn't have a self-loop. @@ -172,7 +176,7 @@ class TransitionModel { int32 NumTransitionIndices(int32 trans_state) const; /// Returns the total number of transition-states (note, these are one-based). - int32 NumTransitionStates() const { return triples_.size(); } + int32 NumTransitionStates() const { return tuples_.size(); } // NumPdfs() actually returns the highest-numbered pdf we ever saw, plus one. 
// In normal cases this should equal the number of pdfs in the system, but if you @@ -249,30 +253,36 @@ class TransitionModel { void MapUpdateShared(const Vector &stats, const MapTransitionUpdateConfig &cfg, BaseFloat *objf_impr_out, BaseFloat *count_out); - void ComputeTriples(const ContextDependencyInterface &ctx_dep); // called from constructor. initializes triples_. + void ComputeTuples(const ContextDependencyInterface &ctx_dep); // called from constructor. initializes tuples_. + void ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep); + void ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep); void ComputeDerived(); // called from constructor and Read function: computes state2id_ and id2state_. void ComputeDerivedOfProbs(); // computes quantities derived from log-probs (currently just // non_self_loop_log_probs_; called whenever log-probs change. void InitializeProbs(); // called from constructor. void Check() const; + bool IsHmm() const; - struct Triple { + struct Tuple { int32 phone; int32 hmm_state; - int32 pdf; - Triple() { } - Triple(int32 phone, int32 hmm_state, int32 pdf): - phone(phone), hmm_state(hmm_state), pdf(pdf) { } - bool operator < (const Triple &other) const { + int32 forward_pdf; + int32 self_loop_pdf; + Tuple() { } + Tuple(int32 phone, int32 hmm_state, int32 forward_pdf, int32 self_loop_pdf): + phone(phone), hmm_state(hmm_state), forward_pdf(forward_pdf), self_loop_pdf(self_loop_pdf) { } + bool operator < (const Tuple &other) const { if (phone < other.phone) return true; else if (phone > other.phone) return false; else if (hmm_state < other.hmm_state) return true; else if (hmm_state > other.hmm_state) return false; - else return pdf < other.pdf; + else if (forward_pdf < other.forward_pdf) return true; + else if (forward_pdf > other.forward_pdf) return false; + else return (self_loop_pdf < other.self_loop_pdf); } - bool operator == (const Triple &other) const { + bool operator == (const Tuple &other) const { return (phone == other.phone && hmm_state == other.hmm_state - && pdf == other.pdf); + && forward_pdf == other.forward_pdf && self_loop_pdf == other.self_loop_pdf); } }; @@ -281,7 +291,7 @@ class TransitionModel { /// Triples indexed by transition state minus one; /// the triples are in sorted order which allows us to do the reverse mapping from /// triple to transition state - std::vector triples_; + std::vector tuples_; /// Gives the first transition_id of each transition-state; indexed by /// the transition-state. Array indexed 1..num-transition-states+1 (the last one @@ -292,6 +302,8 @@ class TransitionModel { /// state (indexed by transition-id). std::vector id2state_; + std::vector id2pdf_id_; + /// For each transition-id, the corresponding log-prob. Indexed by transition-id. Vector log_probs_; @@ -310,12 +322,9 @@ class TransitionModel { }; inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { - // If a lot of time is spent here we may create an extra array - // to handle this. - KALDI_ASSERT(static_cast(trans_id) < id2state_.size() && + KALDI_ASSERT(static_cast(trans_id) < id2pdf_id_.size() && "Likely graph/model mismatch (graph built from wrong model?)"); - int32 trans_state = id2state_[trans_id]; - return triples_[trans_state-1].pdf; + return id2pdf_id_[trans_id]; } /// Works out which pdfs might correspond to the given phones. 
Will return true diff --git a/src/itf/context-dep-itf.h b/src/itf/context-dep-itf.h index b989dd900ea..40681bb5ccd 100644 --- a/src/itf/context-dep-itf.h +++ b/src/itf/context-dep-itf.h @@ -63,9 +63,36 @@ class ContextDependencyInterface { /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which /// pairs of (phone, pdf-class) it can correspond to. (Usually just one). /// c.f. hmm/hmm-topology.h for meaning of pdf-class. - virtual void GetPdfInfo(const std::vector &phones, // list of phones - const std::vector &num_pdf_classes, // indexed by phone, - std::vector > > *pdf_info) + /// This is the old, simpler interface of GetPdfInfo(), and that this one can + /// only be called if the HmmTopology object's IsHmm() function call returns + /// true. + virtual void GetPdfInfo( + const std::vector &phones, // list of phones + const std::vector &num_pdf_classes, // indexed by phone, + std::vector > > *pdf_info) + const = 0; + + /// This function outputs information about what possible pdf-ids can + /// be generated for HMM-states; it covers the general case where + /// the self-loop pdf-class may be different from the forward-transition + /// pdf-class, so we are asking not about the set of possible pdf-ids + /// for a given (phone, pdf-class), but the set of possible ordered pairs + /// (forward-transition-pdf, self-loop-pdf) for a given (phone, + /// forward-transition-pdf-class, self-loop-pdf-class). + /// Note: 'phones' is a list of integer ids of phones, and + /// 'pdf-class-pairs', indexed by phone, is a list of pairs + /// (forward-transition-pdf-class, self-loop-pdf-class) that we can have for + /// that phone. + /// The output 'pdf_info' is indexed first by phone and then by the + /// same index that indexes each element of 'pdf_class_pairs', + /// and tells us for each pair in 'pdf_class_pairs', what is the + /// list of possible (forward-transition-pdf-id, self-loop-pdf-id) that + /// we can have. + /// This is less efficient than the other version of GetPdfInfo(). + virtual void GetPdfInfo( + const std::vector &phones, + const std::vector > > &pdf_class_pairs, + std::vector > > > *pdf_info) const = 0; diff --git a/src/lm/arpa-file-parser-test.cc b/src/lm/arpa-file-parser-test.cc index 7f1edd7b44b..5b5421873c4 100644 --- a/src/lm/arpa-file-parser-test.cc +++ b/src/lm/arpa-file-parser-test.cc @@ -219,13 +219,13 @@ ngram 1=4\n\ ngram 2=2\n\ ngram 3=2\n\ \n\ -\\1-grams:\n\ +\\1-grams: \n\ -5.2 a -3.3\n\ -3.4 \xCE\xB2\n\ 0.0 -2.5\n\ -4.3 \n\ \n\ -\\2-grams:\n\ +\\2-grams:\t\n\ -1.5 a \xCE\xB2 -3.2\n\ -1.3 a -4.2\n\ \n\ diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc index 49b425adca4..d3307a477c0 100644 --- a/src/lm/arpa-file-parser.cc +++ b/src/lm/arpa-file-parser.cc @@ -18,10 +18,10 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
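// To make the indexing of the pair-based GetPdfInfo() declared above concrete, here is a
// minimal usage sketch (not part of the patch; it assumes the usual Kaldi integer/pair
// types for the container arguments and a ContextDependency object built elsewhere):
#include <utility>
#include <vector>
#include "tree/context-dep.h"

void ExamplePairPdfInfo(const kaldi::ContextDependency &ctx_dep) {
  using kaldi::int32;
  std::vector<int32> phones;
  phones.push_back(1);
  phones.push_back(2);
  // pdf_class_pairs is indexed by phone; each entry lists the
  // (forward-transition-pdf-class, self-loop-pdf-class) pairs used by that phone.
  std::vector<std::vector<std::pair<int32, int32> > > pdf_class_pairs(3);
  pdf_class_pairs[1].push_back(std::make_pair(0, 0));
  pdf_class_pairs[2].push_back(std::make_pair(0, 1));
  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
  ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
  // pdf_info[phone][j] now lists the possible
  // (forward-transition-pdf-id, self-loop-pdf-id) pairs for pdf_class_pairs[phone][j].
}
// The older single-pdf-class overload remains available for topologies where IsHmm()
// returns true, i.e. where the forward and self-loop transitions share one pdf-class.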
-#include - #include + #include "base/kaldi-error.h" #include "base/kaldi-math.h" #include "lm/arpa-file-parser.h" @@ -38,6 +38,10 @@ ArpaFileParser::ArpaFileParser(ArpaParseOptions options, ArpaFileParser::~ArpaFileParser() { } +void TrimTrailingWhitespace(std::string *str) { + str->erase(str->find_last_not_of(" \n\r\t") + 1); +} + void ArpaFileParser::Read(std::istream &is, bool binary) { if (binary) { KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; @@ -84,6 +88,8 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { while (++line_number_, getline(is, current_line_) && !is.eof()) { if (current_line_.empty()) continue; + TrimTrailingWhitespace(&current_line_); + // Continue skipping lines until the \data\ marker alone on a line is found. if (!keyword_found) { if (current_line_ == "\\data\\") { @@ -147,7 +153,28 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { int32 ngram_count = 0; while (++line_number_, getline(is, current_line_) && !is.eof()) { if (current_line_.empty()) continue; - if (current_line_[0] == '\\') break; + if (current_line_[0] == '\\') { + TrimTrailingWhitespace(&current_line_); + std::ostringstream next_keyword; + next_keyword << "\\" << cur_order + 1 << "-grams:"; + if ((current_line_ != next_keyword.str()) && + (current_line_ != "\\end\\")) { + if (ShouldWarn()) { + KALDI_WARN << "ignoring possible directive '" << current_line_ + << "' expecting '" << next_keyword.str() << "'"; + + if (warning_count_ > 0 && + warning_count_ > static_cast<uint32>(options_.max_warnings)) { + KALDI_WARN << "Of " << warning_count_ << " parse warnings, " + << options_.max_warnings << " were reported. " + << "Run program with --max_warnings=-1 " + << "to see all warnings"; + } + } + } else { + break; + } + } std::vector<std::string> col; SplitStringToVector(current_line_, " \t", true, &col); @@ -183,7 +210,7 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { } else { word = symbols_->Find(col[1 + index]); if (word == fst::SymbolTable::kNoSymbol) { - switch(options_.oov_handling) { + switch (options_.oov_handling) { case ArpaParseOptions::kReplaceWithUnk: word = options_.unk_symbol; break; @@ -227,7 +254,8 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { PARSE_ERR << "invalid or unexpected directive line, expecting \\end\\"; } - if (warning_count_ > 0 && warning_count_ > (uint32)options_.max_warnings) { + if (warning_count_ > 0 && + warning_count_ > static_cast<uint32>(options_.max_warnings)) { KALDI_WARN << "Of " << warning_count_ << " parse warnings, " << options_.max_warnings << " were reported. Run program with " << "--max_warnings=-1 to see all warnings"; @@ -246,7 +274,7 @@ std::string ArpaFileParser::LineReference() const { } bool ArpaFileParser::ShouldWarn() { - return ++warning_count_ <= (uint32)options_.max_warnings; + return ++warning_count_ <= static_cast<uint32>(options_.max_warnings); } } // namespace kaldi diff --git a/src/lm/arpa-lm-compiler.cc b/src/lm/arpa-lm-compiler.cc index 14378aa374c..634a6267c4e 100644 --- a/src/lm/arpa-lm-compiler.cc +++ b/src/lm/arpa-lm-compiler.cc @@ -318,21 +318,35 @@ void ArpaLmCompiler::ConsumeNGram(const NGram &ngram) { void ArpaLmCompiler::RemoveRedundantStates() { fst::StdArc::Label backoff_symbol = sub_eps_; + if (backoff_symbol == 0) { + // The method of removing redundant states implemented in this function + // leads to slow determinization of L o G when people use the older style of + // usage of arpa2fst where the --disambig-symbol option was not specified.
+ // The issue seems to be that it creates a non-deterministic FST, while G is + // supposed to be deterministic. By 'return'ing below, we just disable this + // method if people were using an older script. This method isn't really + // that consequential anyway, and people will move to the newer-style + // scripts (see current utils/format_lm.sh), so this isn't much of a + // problem. + return; + } + fst::StdArc::StateId num_states = fst_.NumStates(); + + // replace the #0 symbols on the input of arcs out of redundant states (states // that are not final and have only a backoff arc leaving them), with <eps>. - if (backoff_symbol != 0) { - for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator<fst::StdVectorFst> iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } + for (fst::StdArc::StateId state = 0; state < num_states; state++) { + if (fst_.NumArcs(state) == 1 && fst_.Final(state) == fst::TropicalWeight::Zero()) { + fst::MutableArcIterator<fst::StdVectorFst> iter(&fst_, state); + fst::StdArc arc = iter.Value(); + if (arc.ilabel == backoff_symbol) { + arc.ilabel = 0; + iter.SetValue(arc); } } } + // we could call fst::RemoveEps, and it would have the same effect in normal // cases, where backoff_symbol != 0 and there are no epsilons in unexpected // places, but RemoveEpsLocal is a bit safer in case something weird is going diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc index 9608a5475e0..f807529159e 100644 --- a/src/nnet2/nnet-component.cc +++ b/src/nnet2/nnet-component.cc @@ -572,14 +572,7 @@ void NormalizeComponent::Propagate(const ChunkInfo &in_info, const ChunkInfo &out_info, const CuMatrixBase<BaseFloat> &in, CuMatrixBase<BaseFloat> *out) const { - out->CopyFromMat(in); - - CuVector<BaseFloat> in_norm(in.NumRows()); - in_norm.AddDiagMat2(1.0 / in.NumCols(), - in, kNoTrans, 0.0); - in_norm.ApplyFloor(kNormFloor); - in_norm.ApplyPow(-0.5); - out->MulRowsVec(in_norm); + cu::NormalizePerRow(in, BaseFloat(1), false, out); } /* diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index d58a58e6f2b..cee9e8f9bd7 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -156,13 +156,18 @@ void Compiler::ComputeDerivNeeded( if (request_.outputs[output_index].has_deriv) (*deriv_needed)[step] = true; } - // If this is an updatable Component node and the user requested model - // derivatives (e.g. during training), we need this step's derivative. + // If this is an updatable Component node with a nonzero learning rate and + // the user requested model derivatives (e.g. during training), we need this + // step's derivative.
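// The check this comment describes amounts to the following predicate (a sketch in terms
// of the standard nnet3 Component/UpdatableComponent API, not the compiler's literal code):
#include "nnet3/nnet-component-itf.h"

namespace kaldi {
namespace nnet3 {
// A step needs a model derivative only if its component is updatable and its
// learning rate has not been set to zero (e.g. to freeze a layer).
static bool ComponentNeedsModelDerivative(const Component *c) {
  if ((c->Properties() & kUpdatableComponent) == 0)
    return false;
  const UpdatableComponent *u = dynamic_cast<const UpdatableComponent*>(c);
  KALDI_ASSERT(u != NULL);
  return u->LearningRate() != 0.0;
}
}  // namespace nnet3
}  // namespace kaldi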
if (nnet_.IsComponentNode(node_index) && request_.need_model_derivative) { const NetworkNode &node = nnet_.GetNode(node_index); const Component *c = nnet_.GetComponent(node.u.component_index); - if (c->Properties() & kUpdatableComponent) - (*deriv_needed)[step] = true; + if (c->Properties() & kUpdatableComponent) { + const UpdatableComponent *u = dynamic_cast(c); + KALDI_ASSERT(u != NULL); + if (u->LearningRate() != 0) + (*deriv_needed)[step] = true; + } } } if (GetVerboseLevel() >= 5) { diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 5c38d125c98..b40670407c8 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -55,6 +55,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new StatisticsExtractionComponentPrecomputedIndexes(); } else if (cpi_type == "StatisticsPoolingComponentPrecomputedIndexes") { ans = new StatisticsPoolingComponentPrecomputedIndexes(); + } else if (cpi_type == "BackpropTruncationComponentPrecomputedIndexes") { + ans = new BackpropTruncationComponentPrecomputedIndexes(); } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); @@ -143,6 +145,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new ConstantFunctionComponent(); } else if (component_type == "DropoutComponent") { ans = new DropoutComponent(); + } else if (component_type == "BackpropTruncationComponent") { + ans = new BackpropTruncationComponent(); } if (ans != NULL) { KALDI_ASSERT(component_type == ans->Type()); diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 80793bf1d98..f5687ec1d71 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -880,6 +880,291 @@ void StatisticsPoolingComponent::Backprop( indexes->backward_indexes); } +// virtual +void BackpropTruncationComponent::Read(std::istream &is, bool binary) { + // might not see the "" part because + // of how ReadNew() works. 
+ ExpectOneOrTwoTokens(is, binary, "", + ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &clipping_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &zeroing_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &zeroing_interval_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &recurrence_interval_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_clipped_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_zeroed_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_zeroing_boundaries_); + ExpectToken(is, binary, ""); +} + +// virtual +void BackpropTruncationComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, clipping_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, zeroing_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, zeroing_interval_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, recurrence_interval_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_clipped_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_zeroed_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_zeroing_boundaries_); + WriteToken(os, binary, ""); +} + +void BackpropTruncationComponentPrecomputedIndexes::Write(std::ostream &ostream, + bool binary) const { + WriteToken(ostream, binary, + ""); + WriteToken(ostream, binary, ""); + zeroing.Write(ostream, binary); + WriteToken(ostream, binary, ""); + WriteBasicType(ostream, binary, zeroing_sum); + WriteToken(ostream, binary, + ""); +} + +void BackpropTruncationComponentPrecomputedIndexes::Read(std::istream &istream, + bool binary) { + ExpectOneOrTwoTokens(istream, binary, + "", + ""); + zeroing.Read(istream, binary); + ExpectToken(istream, binary, ""); + ReadBasicType(istream, binary, &zeroing_sum); + ExpectToken(istream, binary, + ""); +} + +std::string BackpropTruncationComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ + << ", clipping-threshold=" << clipping_threshold_ + << ", clipped-proportion=" + << (count_ > 0.0 ? num_clipped_ / count_ : 0) + << ", zeroing-threshold=" << zeroing_threshold_ + << ", zeroed-proportion=" + << (count_zeroing_boundaries_ > 0.0 ? 
+ num_zeroed_ / count_zeroing_boundaries_ : 0) + << ", count-zeroing-boundaries=" + << static_cast(count_zeroing_boundaries_); + return stream.str(); +} + +void BackpropTruncationComponent::Init(int32 dim, + BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, + int32 zeroing_interval, + int32 recurrence_interval) { + KALDI_ASSERT(clipping_threshold >= 0 && zeroing_threshold >= 0 && + zeroing_interval > 0 && recurrence_interval > 0 && dim > 0); + dim_ = dim; + clipping_threshold_ = clipping_threshold; + zeroing_threshold_ = zeroing_threshold; + zeroing_interval_ = zeroing_interval; + recurrence_interval_ = recurrence_interval; + num_clipped_ = 0.0; + num_zeroed_ = 0.0; + count_ = 0.0; + count_zeroing_boundaries_ = 0.0; +} + +// virtual +void BackpropTruncationComponent::InitFromConfig(ConfigLine *cfl) { + int32 dim = 0; + bool ok = cfl->GetValue("dim", &dim); + BaseFloat clipping_threshold = 15.0; + BaseFloat zeroing_threshold = 2.0; + int32 zeroing_interval = 20, recurrence_interval = 1; + cfl->GetValue("clipping-threshold", &clipping_threshold); + cfl->GetValue("zeroing-threshold", &zeroing_threshold); + cfl->GetValue("zeroing-interval", &zeroing_interval); + cfl->GetValue("recurrence-interval", &recurrence_interval); + if (!ok || cfl->HasUnusedValues() || + clipping_threshold < 0 || zeroing_threshold < 0 || zeroing_interval < 1 || + recurrence_interval < 1 || dim <= 0) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + Init(dim, clipping_threshold, zeroing_threshold, + zeroing_interval, recurrence_interval); +} + +// virtual +Component* BackpropTruncationComponent::Copy() const { + BackpropTruncationComponent *ans = new BackpropTruncationComponent(); + ans->dim_ = dim_; + ans->clipping_threshold_ = clipping_threshold_; + ans->zeroing_threshold_ = zeroing_threshold_; + ans->zeroing_interval_ = zeroing_interval_; + ans->recurrence_interval_ = recurrence_interval_; + ans->num_clipped_ = num_clipped_; + ans->num_zeroed_ = num_zeroed_; + ans->count_ = count_; + ans->count_zeroing_boundaries_ = count_zeroing_boundaries_; + return ans; +} + +// virtual +ComponentPrecomputedIndexes* +BackpropTruncationComponent::PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const { + int32 num_input_indexes = input_indexes.size(), + num_output_indexes = output_indexes.size(); + KALDI_ASSERT(num_input_indexes == num_output_indexes); + Vector zeroing_cpu(num_output_indexes); + + for (int32 i = 0; i < num_output_indexes; i++) { + const int32 output_n = output_indexes[i].n; + const int32 output_t = output_indexes[i].t; + // checks if output_t crosses a boundary that is a multiple of + // zeroing_interval_. Note that frame (output_t - recurrence_interval_) is + // right before frame output_t in RNNs. If the range + // [output_t - recurrence_interval_, output_t] contains a multiple of + // zeroing_interval_, then frame output_t crosses the boundary. + // output_n is used to shift where we put the boundary, so that + // we don't always zero out gradients on frame 0. It will help avoid + // learning utterance-boundary effects. 
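// A concrete illustration of the boundary test described above (values chosen for
// illustration; DivRoundDown is a local stand-in for Kaldi's DivideRoundingDown):

// Floor division, valid for negative numerators too.
static int DivRoundDown(int a, int b) {
  return (a >= 0) ? (a / b) : -((-a + b - 1) / b);
}
// True if the span (t - recurrence_interval, t], shifted by n, contains a
// multiple of zeroing_interval.
static bool CrossesZeroingBoundary(int t, int n, int zeroing_interval,
                                   int recurrence_interval) {
  return DivRoundDown(t - n, zeroing_interval) !=
         DivRoundDown(t - recurrence_interval - n, zeroing_interval);
}
// With zeroing_interval = 20, recurrence_interval = 3, n = 0:
//   t = 41: floor(41/20) = 2, floor(38/20) = 1  -> boundary crossed, zero the gradient here;
//   t = 45: floor(45/20) = 2, floor(42/20) = 2  -> no boundary, gradient kept.
// In an nnet3 config these parameters appear on the component line, e.g.
//   component name=c type=BackpropTruncationComponent dim=512 clipping-threshold=15 zeroing-threshold=2 zeroing-interval=20 recurrence-interval=3
// (the thresholds and zeroing-interval are the InitFromConfig() defaults shown above;
// dim and recurrence-interval here are illustrative).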
+ if (DivideRoundingDown(output_t - output_n, zeroing_interval_) != + DivideRoundingDown(output_t - recurrence_interval_ - output_n, + zeroing_interval_)) + zeroing_cpu(i) = -1.0; + } + + BackpropTruncationComponentPrecomputedIndexes *ans = new + BackpropTruncationComponentPrecomputedIndexes(); + ans->zeroing = zeroing_cpu; + ans->zeroing_sum = -zeroing_cpu.Sum(); + return ans; +} + +// virtual +void BackpropTruncationComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in, + CuMatrixBase<BaseFloat> *out) const { + out->CopyFromMat(in); +} + +// virtual +void BackpropTruncationComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase<BaseFloat> &, //in_value + const CuMatrixBase<BaseFloat> &, + const CuMatrixBase<BaseFloat> &out_deriv, + Component *to_update_in, // may be NULL; may be + // identical to "this" or different. + CuMatrixBase<BaseFloat> *in_deriv) const { + const BackpropTruncationComponentPrecomputedIndexes *indexes = + dynamic_cast<const BackpropTruncationComponentPrecomputedIndexes*>( + indexes_in); + KALDI_ASSERT(indexes->zeroing.Dim() == out_deriv.NumRows()); + // the following statement will do nothing if in_deriv and out_deriv have same + // memory. + in_deriv->CopyFromMat(out_deriv); + + BackpropTruncationComponent *to_update = + dynamic_cast<BackpropTruncationComponent*>(to_update_in); + + // computes clipping_scales + BaseFloat clipping_threshold = + (clipping_threshold_ <= 0.0 ? 1.0e+10 : clipping_threshold_); + // each row in the derivative matrix, which corresponds to one sample in + // the mini-batch, is scaled to have a max-norm of clipping_threshold_ + CuVector<BaseFloat> clipping_scales(in_deriv->NumRows()); + clipping_scales.AddDiagMat2(pow(clipping_threshold, -2), *in_deriv, + kNoTrans, 0.0); + // now clipping_scales contains the squared (norm of each row divided by + // clipping_threshold) + int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); + // now clipping_scales contains max(1, squared-(norm/clipping_threshold)) + clipping_scales.ApplyPow(-0.5); + // now clipping_scales contains min(1, clipping_threshold/vector_norm) + if (to_update != NULL) { + to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled); + to_update->count_ += clipping_scales.Dim(); + } + + // computes zeroing_scales + BaseFloat zeroing_threshold = + (zeroing_threshold_ <= 0.0 ? 1.0e+10 : zeroing_threshold_); + // zeroing_scales_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors).
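// The clipping step above is a per-row max-norm constraint.  On a plain CPU matrix the
// same operation could be written directly as follows (a sketch, not the component's
// actual CuMatrix code path):
#include <cmath>
#include "matrix/matrix-lib.h"

namespace kaldi {
// Scale down any row of *deriv whose 2-norm exceeds 'threshold' so that its norm
// becomes exactly 'threshold'; other rows are left untouched.  This is what the
// AddDiagMat2 / ApplyFloor / ApplyPow(-0.5) sequence computes as one batched
// operation on the GPU.
void ClipRowsToMaxNorm(BaseFloat threshold, Matrix<BaseFloat> *deriv) {
  for (MatrixIndexT r = 0; r < deriv->NumRows(); r++) {
    SubVector<BaseFloat> row(deriv->Row(r));
    BaseFloat norm = std::sqrt(VecVec(row, row));
    if (norm > threshold)
      row.Scale(threshold / norm);  // i.e. multiply by min(1, threshold/norm)
  }
}
}  // namespace kaldi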
+ CuMatrix<BaseFloat> zeroing_scales(1, in_deriv->NumRows()); + CuSubVector<BaseFloat> zeroing_scales_vec(zeroing_scales, 0); + zeroing_scales_vec.Set(-pow(zeroing_threshold, 2)); + // now zeroing_scales_vec contains -(squared zeroing_threshold) + zeroing_scales_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 1.0); + // now zeroing_scales_vec contains squared norm of each row - + // squared zeroing_threshold + zeroing_scales.ApplyHeaviside(); + // now the element of zeroing_scales_vec is 1.0 if its corresponding + // sample's norm exceeds zeroing_threshold, and 0.0 otherwise + zeroing_scales_vec.MulElements(indexes->zeroing); + // now the element of zeroing_scales_vec is -1.0 if we want to zero its + // corresponding sample's gradient, and 0.0 otherwise + if (to_update != NULL) { + to_update->num_zeroed_ -= zeroing_scales_vec.Sum(); // since it is negative + to_update->count_zeroing_boundaries_ += indexes->zeroing_sum; + } + zeroing_scales_vec.Add(1.0); + // now the element of zeroing_scales_vec is 0.0 if we want to zero its + // corresponding sample's gradient, and 1.0 otherwise + + // combines clipping_scales and zeroing_scales and applies combined_scales + // to in_deriv all at once + CuVector<BaseFloat> combined_scales(clipping_scales); + combined_scales.MulElements(zeroing_scales_vec); + in_deriv->MulRowsVec(combined_scales); +} + +// virtual +void BackpropTruncationComponent::ZeroStats() { + count_ = 0.0; + count_zeroing_boundaries_ = 0.0; + num_clipped_ = 0.0; + num_zeroed_ = 0.0; +} + +// virtual +void BackpropTruncationComponent::Scale(BaseFloat scale) { + count_ *= scale; + count_zeroing_boundaries_ *= scale; + num_clipped_ *= scale; + num_zeroed_ *= scale; +} + +// virtual +void BackpropTruncationComponent::Add(BaseFloat alpha, + const Component &other_in) { + const BackpropTruncationComponent *other = + dynamic_cast<const BackpropTruncationComponent*>(&other_in); + KALDI_ASSERT(other != NULL); + count_ += alpha * other->count_; + count_zeroing_boundaries_ += alpha * other->count_zeroing_boundaries_; + num_clipped_ += alpha * other->num_clipped_; + num_zeroed_ += alpha * other->num_zeroed_; +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 5e94d4ba332..93a46eaedbf 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -440,6 +440,139 @@ class StatisticsPoolingComponentPrecomputedIndexes: virtual std::string Type() const { return "StatisticsPoolingComponentPrecomputedIndexes"; } }; +// BackpropTruncationComponent zeroes out the gradients every certain number +// of frames, as well as having gradient-clipping functionality as +// ClipGradientComponent.
+// This component will be used to prevent gradient explosion problem in +// recurrent neural networks +class BackpropTruncationComponent: public Component { + public: + BackpropTruncationComponent(int32 dim, + BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, + int32 zeroing_interval, + int32 recurrence_interval) { + Init(dim, clipping_threshold, zeroing_threshold, + zeroing_interval, recurrence_interval);} + + BackpropTruncationComponent(): dim_(0), clipping_threshold_(-1), + zeroing_threshold_(-1), zeroing_interval_(0), recurrence_interval_(0), + num_clipped_(0), num_zeroed_(0), count_(0), count_zeroing_boundaries_(0) { } + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + virtual void InitFromConfig(ConfigLine *cfl); + void Init(int32 dim, BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, int32 zeroing_interval, + int32 recurrence_interval); + + virtual std::string Type() const { return "BackpropTruncationComponent"; } + + virtual int32 Properties() const { + return kLinearInInput|kPropagateInPlace|kBackpropInPlace; + } + + virtual void ZeroStats(); + + virtual Component* Copy() const; + + virtual void Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual ComponentPrecomputedIndexes* PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const; + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; + virtual ~BackpropTruncationComponent() { + } + private: + // input/output dimension + int32 dim_; + + // threshold (e.g., 30) to be used for clipping corresponds to max-row-norm + BaseFloat clipping_threshold_; + + // threshold (e.g., 3) to be used for zeroing corresponds to max-row-norm + BaseFloat zeroing_threshold_; + + // interval (e.g., 20, in number of frames) at which we would zero the + // gradient if the norm of the gradient is above zeroing_threshold_ + int32 zeroing_interval_; + + // recurrence_interval_ should be the absolute recurrence offset used in RNNs + // (e.g., 3). It is used to see whether the index the component is processing, + // crosses a boundary that's a multiple of zeroing_interval_ frames. + int32 recurrence_interval_; + + // component-node name, used in the destructor to print out stats of + // self-repair + std::string debug_info_; + + BackpropTruncationComponent &operator = + (const BackpropTruncationComponent &other); // Disallow. 
+ + protected: + // variables to store stats + // An element corresponds to rows of derivative matrix + double num_clipped_; // number of elements which were clipped + double num_zeroed_; // number of elements which were zeroed + double count_; // number of elements which were processed + double count_zeroing_boundaries_; // number of zeroing boundaries where we had + // the opportunity to perform zeroing + // the gradient + +}; + +class BackpropTruncationComponentPrecomputedIndexes: + public ComponentPrecomputedIndexes { + public: + + // zeroing has the same dimension as the number of rows of out-deriv. + // Each element in zeroing can take two possible values: -1.0, meaning its + // corresponding frame is one that we need to consider zeroing the + // gradient of, and 0.0 otherwise + CuVector zeroing; + + // caches the negative sum of elements in zeroing for less CUDA calls + // (the sum is computed by CPU). Note that this value would be positive. + BaseFloat zeroing_sum; + + BackpropTruncationComponentPrecomputedIndexes(): zeroing_sum(0.0) {} + + // this class has a virtual destructor so it can be deleted from a pointer + // to ComponentPrecomputedIndexes. + virtual ~BackpropTruncationComponentPrecomputedIndexes() { } + + virtual ComponentPrecomputedIndexes* Copy() const { + return new BackpropTruncationComponentPrecomputedIndexes(*this); + } + + virtual void Write(std::ostream &ostream, bool binary) const; + + virtual void Read(std::istream &istream, bool binary); + + virtual std::string Type() const { + return "BackpropTruncationComponentPrecomputedIndexes"; + } +}; + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc index af2147147d7..ad5f715a294 100644 --- a/src/nnet3/nnet-nnet.cc +++ b/src/nnet3/nnet-nnet.cc @@ -783,6 +783,13 @@ Nnet& Nnet::operator =(const Nnet &nnet) { std::string Nnet::Info() const { std::ostringstream os; + + if(IsSimpleNnet(*this)) { + int32 left_context, right_context; + ComputeSimpleNnetContext(*this, &left_context, &right_context); + os << "left-context: " << left_context << "\n"; + os << "right-context: " << right_context << "\n"; + } os << "num-parameters: " << NumParameters(*this) << "\n"; os << "modulus: " << this->Modulus() << "\n"; std::vector config_lines; diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 733d162748e..3bacf455f3b 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -427,7 +427,7 @@ bool IsValidName(const std::string &name) { for (size_t i = 0; i < name.size(); i++) { if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-') + if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') return false; } return true; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index b84ac90c76e..390ab2885a9 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -96,16 +96,16 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) { BaseFloat dropout_proportion = 0.0; bool ok = cfl->GetValue("dim", &dim) && cfl->GetValue("dropout-proportion", &dropout_proportion); - if (!ok || cfl->HasUnusedValues() || dim <= 0 || + if (!ok || cfl->HasUnusedValues() || dim <= 0 || dropout_proportion < 0.0 || dropout_proportion > 1.0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << 
cfl->WholeLine() << "\""; Init(dim, dropout_proportion); } std::string DropoutComponent::Info() const { std::ostringstream stream; - stream << Type() << ", dim = " << dim_ + stream << Type() << ", dim = " << dim_ << ", dropout-proportion = " << dropout_proportion_; return stream.str(); } @@ -119,12 +119,12 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); - // This const_cast is only safe assuming you don't attempt + // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. - const_cast&>(random_generator_).RandUniform(out); + const_cast&>(random_generator_).RandUniform(out); - out->Add(-dropout); // now, a proportion "dropout" will be <0.0 - out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will + out->Add(-dropout); // now, a proportion "dropout" will be <0.0 + out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will // be zero and (1 - dropout) will be 1.0. out->MulElements(in); @@ -147,7 +147,7 @@ void DropoutComponent::Backprop(const std::string &debug_info, } - + void DropoutComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &dim_); @@ -415,21 +415,7 @@ void NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { KALDI_ASSERT(out->NumCols() == in.NumCols() + (add_log_stddev_ ? 1 : 0)); - CuSubMatrix out_no_log(*out, 0, out->NumRows(), 0, input_dim_); - if (in.Data() != out_no_log.Data()) - out_no_log.CopyFromMat(in); - CuVector in_norm(in.NumRows()); - BaseFloat d_scaled = in.NumCols() * target_rms_ * target_rms_; - in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); - in_norm.ApplyFloor(kSquaredNormFloor); - in_norm.ApplyPow(-0.5); - out_no_log.MulRowsVec(in_norm); - if (add_log_stddev_) { - in_norm.ApplyLog(); - in_norm.Scale(-1.0); - in_norm.Add(log(target_rms_)); - out->CopyColFromVec(in_norm, in.NumCols()); - } + cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); } /* diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index e02ae4974c9..0b000b5b4ef 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -513,6 +513,178 @@ void GenerateConfigSequenceLstm( configs->push_back(os.str()); } +void GenerateConfigSequenceLstmWithTruncation( + const NnetGenerationOptions &opts, + std::vector *configs) { + std::ostringstream os; + + std::vector splice_context; + for (int32 i = -5; i < 4; i++) + if (Rand() % 3 == 0) + splice_context.push_back(i); + if (splice_context.empty()) + splice_context.push_back(0); + + int32 input_dim = 10 + Rand() % 20, + spliced_dim = input_dim * splice_context.size(), + output_dim = (opts.output_dim > 0 ? 
+ opts.output_dim : + 100 + Rand() % 200), + cell_dim = 40 + Rand() % 50, + projection_dim = std::ceil(cell_dim / (Rand() % 10 + 1)); + int32 clipping_threshold = RandInt(6, 50), + zeroing_threshold = RandInt(1, 5), + zeroing_interval = RandInt(1, 5) * 10; + + os << "input-node name=input dim=" << input_dim << std::endl; + + // Parameter Definitions W*(* replaced by - to have valid names) + // Input gate control : Wi* matrices + os << "component name=Wi-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=Wic type=PerElementScaleComponent " + << " dim=" << cell_dim << std::endl; + + // Forget gate control : Wf* matrices + os << "component name=Wf-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=Wfc type=PerElementScaleComponent " + << " dim=" << cell_dim << std::endl; + + // Output gate control : Wo* matrices + os << "component name=Wo-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=Woc type=PerElementScaleComponent " + << " dim=" << cell_dim << std::endl; + + // Cell input matrices : Wc* matrices + os << "component name=Wc-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + + + + // projection matrices : Wrm and Wpm + os << "component name=W-m type=NaturalGradientAffineComponent " + << " input-dim=" << cell_dim + << " output-dim=" << 2 * projection_dim << std::endl; + + // Output : Wyr and Wyp + os << "component name=Wy- type=NaturalGradientAffineComponent " + << " input-dim=" << 2 * projection_dim + << " output-dim=" << cell_dim << std::endl; + + // Defining the diagonal matrices + // Defining the final affine transform + os << "component name=final_affine type=NaturalGradientAffineComponent " + << "input-dim=" << cell_dim << " output-dim=" << output_dim << std::endl; + os << "component name=logsoftmax type=LogSoftmaxComponent dim=" + << output_dim << std::endl; + + // Defining the non-linearities + // declare a no-op component so that we can use a sum descriptor's output + // multiple times, and to make the config more readable given the equations + os << "component name=i type=SigmoidComponent dim=" + << cell_dim << std::endl; + os << "component name=f type=SigmoidComponent dim=" + << cell_dim << std::endl; + os << "component name=o type=SigmoidComponent dim=" + << cell_dim << std::endl; + os << "component name=g type=TanhComponent dim=" + << cell_dim << std::endl; + os << "component name=h type=TanhComponent dim=" + << cell_dim << std::endl; + os << "component name=c1 type=ElementwiseProductComponent " + << " input-dim=" << 2 * cell_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=c2 type=ElementwiseProductComponent " + << " input-dim=" << 2 * cell_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=m type=ElementwiseProductComponent " + << " input-dim=" << 2 * cell_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=c type=BackpropTruncationComponent dim=" + << cell_dim + << " clipping-threshold=" << clipping_threshold + << " zeroing-threshold=" << zeroing_threshold + << " zeroing-interval=" << zeroing_interval + << " recurrence-interval=1" << std::endl; + os << "component name=r 
type=BackpropTruncationComponent dim=" + << projection_dim + << " clipping-threshold=" << clipping_threshold + << " zeroing-threshold=" << zeroing_threshold + << " zeroing-interval=" << zeroing_interval + << " recurrence-interval=1" << std::endl; + + // Defining the computations + std::ostringstream temp_string_stream; + for (size_t i = 0; i < splice_context.size(); i++) { + int32 offset = splice_context[i]; + temp_string_stream << "Offset(input, " << offset << ")"; + if (i + 1 < splice_context.size()) + temp_string_stream << ", "; + } + std::string spliced_input = temp_string_stream.str(); + + std::string c_tminus1 = "IfDefined(Offset(c_t, -1))"; + os << "component-node name=c_t component=c input=Sum(c1_t, c2_t)\n"; + + // i_t + os << "component-node name=i1 component=Wi-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=i2 component=Wic " + << " input=" << c_tminus1 << std::endl; + os << "component-node name=i_t component=i input=Sum(i1, i2)\n"; + + // f_t + os << "component-node name=f1 component=Wf-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=f2 component=Wfc " + << " input=" << c_tminus1 << std::endl; + os << "component-node name=f_t component=f input=Sum(f1, f2)\n"; + + // o_t + os << "component-node name=o1 component=Wo-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=o2 component=Woc input=Sum(c1_t, c2_t)\n"; + os << "component-node name=o_t component=o input=Sum(o1, o2)\n"; + + // h_t + os << "component-node name=h_t component=h input=Sum(c1_t, c2_t)\n"; + + // g_t + os << "component-node name=g1 component=Wc-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=g_t component=g input=g1\n"; + + // parts of c_t + os << "component-node name=c1_t component=c1 " + << " input=Append(f_t, " << c_tminus1 << ")\n"; + os << "component-node name=c2_t component=c2 input=Append(i_t, g_t)\n"; + + // m_t + os << "component-node name=m_t component=m input=Append(o_t, h_t)\n"; + + // r_t and p_t + os << "component-node name=rp_t component=W-m input=m_t\n"; + // Splitting outputs of Wy- node + os << "dim-range-node name=r_t_pretrunc input-node=rp_t dim-offset=0 " + << "dim=" << projection_dim << std::endl; + os << "component-node name=r_t component=r input=r_t_pretrunc\n"; + + // y_t + os << "component-node name=y_t component=Wy- input=rp_t\n"; + + // Final affine transform + os << "component-node name=final_affine component=final_affine input=y_t\n"; + os << "component-node name=posteriors component=logsoftmax input=final_affine\n"; + os << "output-node name=output input=posteriors\n"; + configs->push_back(os.str()); +} + // This is a different LSTM config where computation is bunched according // to inputs this is not complete, it is left here for future comparisons void GenerateConfigSequenceLstmType2( @@ -802,7 +974,7 @@ void GenerateConfigSequence( const NnetGenerationOptions &opts, std::vector *configs) { start: - int32 network_type = RandInt(0, 10); + int32 network_type = RandInt(0, 11); switch(network_type) { case 0: GenerateConfigSequenceSimplest(opts, configs); @@ -855,6 +1027,12 @@ void GenerateConfigSequence( case 10: GenerateConfigSequenceStatistics(opts, configs); break; + case 11: + if (!opts.allow_recursion || !opts.allow_context || + !opts.allow_nonlinearity) + goto start; + GenerateConfigSequenceLstmWithTruncation(opts, configs); + break; default: KALDI_ERR << "Error 
generating config sequence."; } diff --git a/src/nnet3/online-nnet3-decodable-simple.h b/src/nnet3/online-nnet3-decodable-simple.h index cfd70ccea38..af7c18da64b 100644 --- a/src/nnet3/online-nnet3-decodable-simple.h +++ b/src/nnet3/online-nnet3-decodable-simple.h @@ -102,6 +102,7 @@ class DecodableNnet3SimpleOnline: public DecodableInterface { /// Indices are one-based! This is for compatibility with OpenFst. virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } + int32 FrameSubsamplingFactor() const { return opts_.frame_subsampling_factor; } private: /// If the neural-network outputs for this frame are not cached, it computes diff --git a/src/nnet3bin/nnet3-latgen-faster-parallel.cc b/src/nnet3bin/nnet3-latgen-faster-parallel.cc index 7d157f6e89c..e55a213f14f 100644 --- a/src/nnet3bin/nnet3-latgen-faster-parallel.cc +++ b/src/nnet3bin/nnet3-latgen-faster-parallel.cc @@ -228,8 +228,10 @@ int main(int argc, char *argv[]) { } } + // the following constructor takes ownership of the FST pointer so that + // it is deleted when 'decoder' is deleted. LatticeFasterDecoder *decoder = - new LatticeFasterDecoder(fst_reader.Value(), config); + new LatticeFasterDecoder(config, fst_reader.Value().Copy()); DecodableInterface *nnet_decodable = new DecodableAmNnetSimpleParallel( diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc index fd4881666ae..8dd366166c0 100644 --- a/src/online2/online-nnet3-decoding.cc +++ b/src/online2/online-nnet3-decoding.cc @@ -72,8 +72,9 @@ void SingleUtteranceNnet3Decoder::GetBestPath(bool end_of_utterance, bool SingleUtteranceNnet3Decoder::EndpointDetected( const OnlineEndpointConfig &config) { + int32 subsample = decodable_.FrameSubsamplingFactor(); return kaldi::EndpointDetected(config, tmodel_, - feature_pipeline_->FrameShiftInSeconds(), + feature_pipeline_->FrameShiftInSeconds() * subsample, decoder_); } diff --git a/src/sgmm2/Makefile b/src/sgmm2/Makefile index 41a4175aa3b..f0da85e48de 100644 --- a/src/sgmm2/Makefile +++ b/src/sgmm2/Makefile @@ -14,6 +14,6 @@ LIBNAME = kaldi-sgmm2 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/transform/Makefile b/src/transform/Makefile index 3ae8b1fa3a4..4df681f1ade 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -14,8 +14,8 @@ OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ LIBNAME = kaldi-transform -ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a +ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc index 81eee5bb4ee..4eab67f52be 100644 --- a/src/tree/context-dep.cc +++ b/src/tree/context-dep.cc @@ -178,9 +178,107 @@ void ContextDependency::Read (std::istream &is, bool binary) { to_pdf_ = to_pdf; } -void ContextDependency::GetPdfInfo(const std::vector &phones, - const std::vector &num_pdf_classes, // indexed by phone, - std::vector > > *pdf_info) const { +void ContextDependency::EnumeratePairs( + const std::vector &phones, + int32 self_loop_pdf_class, int32 forward_pdf_class, + const std::vector 
&phone_window, + unordered_set, PairHasher > *pairs) const { + std::vector new_phone_window(phone_window); + EventType vec; + + std::vector forward_pdfs, self_loop_pdfs; + + // get list of possible forward pdfs + vec.clear(); + for (size_t i = 0; i < N_; i++) + if (phone_window[i] >= 0) + vec.push_back(std::make_pair(static_cast(i), + static_cast(phone_window[i]))); + vec.push_back(std::make_pair(kPdfClass, static_cast(forward_pdf_class))); + std::sort(vec.begin(), vec.end()); + to_pdf_->MultiMap(vec, &forward_pdfs); + SortAndUniq(&forward_pdfs); + + // get list of possible self-loop pdfs + vec.clear(); + for (size_t i = 0; i < N_; i++) + if (phone_window[i] >= 0) + vec.push_back(std::make_pair(static_cast(i), + static_cast(phone_window[i]))); + vec.push_back(std::make_pair(kPdfClass, static_cast(self_loop_pdf_class))); + std::sort(vec.begin(), vec.end()); + to_pdf_->MultiMap(vec, &self_loop_pdfs); + SortAndUniq(&self_loop_pdfs); + + if (forward_pdfs.size() == 1 || self_loop_pdfs.size() == 1) { + for (size_t m = 0; m < forward_pdfs.size(); m++) + for (size_t n = 0; n < self_loop_pdfs.size(); n++) + pairs->insert(std::make_pair(forward_pdfs[m], self_loop_pdfs[n])); + } else { + // Choose 'position' as a phone position in 'context' that's currently + // -1, and that is as close as possible to the central position P. + int32 position = 0; + int32 min_dist = N_ - 1; + for (int32 i = 0; i < N_; i++) { + int32 dist = (P_ - i > 0) ? (P_ - i) : (i - P_); + if (phone_window[i] == -1 && dist < min_dist) { + position = i; + min_dist = dist; + } + } + KALDI_ASSERT(min_dist < N_); + KALDI_ASSERT(position != P_); + + // The next two lines have to do with how BOS/EOS effects are handled in + // phone context. Zero phone value in a non-central position (i.e. not + // position P_... and 'position' will never equal P_) means 'there is no + // phone here because we're at BOS or EOS'. 
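// A small worked trace of the recursion at this point (triphone case; the numbers are
// illustrative, not taken from a real tree):
// Suppose N_ = 3, P_ = 1, phones = {1, 2, 3}, and the current window is {-1, 2, -1},
// i.e. only the central phone is known.  If neither the forward nor the self-loop pdf
// set is already down to a single pdf, the wildcard position nearest the center, here
// index 0, is selected, and the function recurses with the windows {0, 2, -1}
// (position 0 at BOS/EOS), {1, 2, -1}, {2, 2, -1} and {3, 2, -1}.  Each recursive call
// either finds that one of the two pdf sets has a single element and inserts the
// cross-product of the two sets into 'pairs', or refines the remaining wildcard at
// index 2 in the same way.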
+ new_phone_window[position] = 0; + EnumeratePairs(phones, self_loop_pdf_class, forward_pdf_class, + new_phone_window, pairs); + + for (size_t i = 0 ; i < phones.size(); i++) { + new_phone_window[position] = phones[i]; + EnumeratePairs(phones, self_loop_pdf_class, forward_pdf_class, + new_phone_window, pairs); + } + } +} + +void ContextDependency::GetPdfInfo( + const std::vector &phones, + const std::vector > > &pdf_class_pairs, + std::vector > > > *pdf_info) const { + + KALDI_ASSERT(pdf_info != NULL); + pdf_info->resize(1 + *std::max_element(phones.begin(), phones.end())); + std::vector phone_window(N_, -1); + EventType vec; + for (size_t i = 0 ; i < phones.size(); i++) { + // loop over phones + int32 phone = phones[i]; + (*pdf_info)[phone].resize(pdf_class_pairs[phone].size()); + for (size_t j = 0; j < pdf_class_pairs[phone].size(); j++) { + // loop over pdf_class pairs + int32 pdf_class = pdf_class_pairs[phone][j].first, + self_loop_pdf_class = pdf_class_pairs[phone][j].second; + phone_window[P_] = phone; + + unordered_set, PairHasher > pairs; + EnumeratePairs(phones, self_loop_pdf_class, pdf_class, phone_window, &pairs); + unordered_set, PairHasher >::iterator iter = pairs.begin(), + end = pairs.end(); + for (; iter != end; ++iter) + (*pdf_info)[phone][j].push_back(*iter); + std::sort( ((*pdf_info)[phone][j]).begin(), ((*pdf_info)[phone][j]).end()); + } + } +} + +void ContextDependency::GetPdfInfo( + const std::vector &phones, + const std::vector &num_pdf_classes, // indexed by phone, + std::vector > > *pdf_info) const { EventType vec; KALDI_ASSERT(pdf_info != NULL); diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h index 08dc974570d..6342d89667b 100644 --- a/src/tree/context-dep.h +++ b/src/tree/context-dep.h @@ -20,6 +20,7 @@ #ifndef KALDI_TREE_CONTEXT_DEP_H_ #define KALDI_TREE_CONTEXT_DEP_H_ +#include "util/stl-utils.h" #include "itf/context-dep-itf.h" #include "tree/event-map.h" #include "matrix/matrix-lib.h" @@ -99,9 +100,36 @@ class ContextDependency: public ContextDependencyInterface { /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which /// pairs of (phone, pdf-class) it can correspond to. (Usually just one). /// c.f. hmm/hmm-topology.h for meaning of pdf-class. - virtual void GetPdfInfo(const std::vector &phones, // list of phones - const std::vector &num_pdf_classes, // indexed by phone, - std::vector > > *pdf_info) + /// This is the old, simpler interface of GetPdfInfo(), and that this one can + /// only be called if the HmmTopology object's IsHmm() function call returns + /// true. + virtual void GetPdfInfo( + const std::vector &phones, // list of phones + const std::vector &num_pdf_classes, // indexed by phone, + std::vector > > *pdf_info) + const; + + /// This function outputs information about what possible pdf-ids can + /// be generated for HMM-states; it covers the general case where + /// the self-loop pdf-class may be different from the forward-transition + /// pdf-class, so we are asking not about the set of possible pdf-ids + /// for a given (phone, pdf-class), but the set of possible ordered pairs + /// (forward-transition-pdf, self-loop-pdf) for a given (phone, + /// forward-transition-pdf-class, self-loop-pdf-class). + /// Note: 'phones' is a list of integer ids of phones, and + /// 'pdf-class-pairs', indexed by phone, is a list of pairs + /// (forward-transition-pdf-class, self-loop-pdf-class) that we can have for + /// that phone. 
+ /// The output 'pdf_info' is indexed first by phone and then by the + /// same index that indexes each element of 'pdf_class_pairs', + /// and tells us for each pair in 'pdf_class_pairs', what is the + /// list of possible (forward-transition-pdf-id, self-loop-pdf-id) that + /// we can have. + /// This is less efficient than the other version of GetPdfInfo(). + virtual void GetPdfInfo( + const std::vector &phones, + const std::vector > > &pdf_class_pairs, + std::vector > > > *pdf_info) const; private: @@ -109,6 +137,20 @@ class ContextDependency: public ContextDependencyInterface { int32 P_; EventMap *to_pdf_; // owned here. + // 'context' is the context-window of phones, of + // length N, with -1 for those positions where phones + // that are currently unknown, treated as wildcards; at least + // the central phone [position P] must be a real phone, i.e. + // not -1. + // This function inserts any allowed pairs (forward_pdf, self_loop_pdf) + // to the set "pairs". + void EnumeratePairs( + const std::vector &phones, + int32 self_loop_pdf_class, int32 forward_pdf_class, + const std::vector &context, + unordered_set, PairHasher > *pairs) + const; + KALDI_DISALLOW_COPY_AND_ASSIGN(ContextDependency); }; diff --git a/src/util/kaldi-holder.cc b/src/util/kaldi-holder.cc index ee7dd66e922..a26bdf2ce29 100644 --- a/src/util/kaldi-holder.cc +++ b/src/util/kaldi-holder.cc @@ -72,8 +72,8 @@ bool ExtractObjectRange(const Matrix &input, const std::string &range, // template instantiation template bool ExtractObjectRange(const Matrix &, const std::string &, Matrix *); -template bool ExtractObjectRange(const Matrix &, const std::string &, - Matrix *); +template bool ExtractObjectRange(const Matrix &, const std::string &, + Matrix *); bool ExtractRangeSpecifier(const std::string &rxfilename_with_range, std::string *data_rxfilename, diff --git a/tools/Makefile b/tools/Makefile index 548afafca1e..714e613e4bf 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -78,8 +78,12 @@ openfst-$(OPENFST_VERSION)/lib: | openfst-$(OPENFST_VERSION)/Makefile # Add the -O flag to CXXFLAGS on cygwin as it can fix the compilation error # "file too big". openfst-$(OPENFST_VERSION)/Makefile: openfst-$(OPENFST_VERSION)/.patched | check_required_programs +# Note: OSTYPE path is probably dead for latest cygwin64 (installed on 2016/11/11). 
ifeq ($(OSTYPE),cygwin) - cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" + cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" +# This new OS path is confirmed working on Windows 10 / Cygwin64 +else ifeq ($(OS),Windows_NT) + cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" else cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" endif @@ -96,7 +100,7 @@ openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz tar xozf openfst-$(OPENFST_VERSION).tar.gz openfst-$(OPENFST_VERSION).tar.gz: - wget http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ + wget --tries=1 -T 5 http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ wget -T 10 -t 3 http://www.openslr.org/resources/2/openfst-$(OPENFST_VERSION).tar.gz sclite: sclite_compiled @@ -172,5 +176,3 @@ openblas_compiled: cd OpenBLAS; sed 's:# FCOMMON_OPT = -frecursive:FCOMMON_OPT = -frecursive:' < Makefile.rule >tmp && mv tmp Makefile.rule # $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=1 NUM_THREADS=64 -C OpenBLAS all install $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=0 -C OpenBLAS all install - - diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index b03020f292d..02145c7f0c8 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -36,6 +36,11 @@ else fi fi +command -v swig >/dev/null 2>&1 || { + echo >&2 "$0: Error: I require swig but it's not installed."; + echo >&2 " Please install swig and run this script again. " + exit 1; +} if [ -d ./g2p ] || [ -d sequitur ] ; then echo >&2 "$0: Warning: old installation of Sequitur found. You should manually" @@ -59,7 +64,7 @@ if [ ! -d ./sequitur-g2p ] ; then } fi #just to retain backward compatibility for a while. Can be removed -#in a couple of months. +#in a couple of months. ln -sf sequitur-g2p sequitur diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 564760a7353..a857f538edd 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -54,7 +54,7 @@ runvx cd tools runvx make openfst $CCC CXXFLAGS="$CF" -j$MAXPAR cd .. runvx cd src -runvx ./configure --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr +runvx ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr make_kaldi() { runvx make "$@" $CCC EXTRA_CXXFLAGS="$CF" EXTRA_LDLIBS="$LDF"