diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh index d9437af7e0c..5943494d8e1 100755 --- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index f74a4ebaf6a..52b24e3a27c 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -48,6 +48,13 @@ %WER 24.2 | 13098 94477 | 79.3 12.2 8.6 3.5 24.2 57.1 | -0.178 | exp/ihm/nnet3/tdnn_sp/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 25.4 | 12643 89970 | 77.6 13.7 8.7 3.0 25.4 56.3 | -0.067 | exp/ihm/nnet3/tdnn_sp/decode_eval/ascore_12/eval_hires.ctm.filt.sys +# local/nnet3/run_blstm.sh --mic ihm +# nnet3 xent BLSTM with data cleaning +# for d in exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 22.3 | 13098 94494 | 80.9 11.7 7.4 3.2 22.3 55.7 | -0.618 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.5 | 12643 89962 | 80.2 12.7 7.1 2.7 22.5 53.4 | -0.476 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys + ############################################ # local/chain/run_tdnn.sh --mic ihm --stage 12 & diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index a6c9d8192ec..7b1e56b5903 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -46,6 +46,12 @@ %WER 41.6 | 14493 94516 | 63.3 23.5 13.2 4.9 41.6 66.8 | 0.639 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_dev/ascore_13/dev_hires_o4.ctm.filt.sys %WER 46.0 | 13597 89967 | 57.5 24.9 17.6 3.6 46.0 68.1 | 0.601 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_eval/ascore_14/eval_hires_o4.ctm.filt.sys +# xent BLSTM system; cleaned data and IHM alignments. 
+# local/nnet3/run_blstm.sh --mic sdm1 --use-ihm-ali true +# for d in exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 37.8 | 14633 94518 | 67.1 22.3 10.7 4.9 37.8 64.2 | 0.745 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 41.4 | 13809 89628 | 62.7 24.1 13.2 4.1 41.4 65.2 | 0.723 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys # ========================= diff --git a/egs/ami/s5b/local/nnet3/run_blstm.sh b/egs/ami/s5b/local/nnet3/run_blstm.sh new file mode 100755 index 00000000000..776151fb5aa --- /dev/null +++ b/egs/ami/s5b/local/nnet3/run_blstm.sh @@ -0,0 +1,52 @@ +stage=0 +train_stage=-10 +mic=ihm +affix=bidirectional +common_egs_dir= +remove_egs=true +use_ihm_ali=false +train_set=train_cleaned +ihm_gmm=tri3 +nnet3_affix=_cleaned + +# BLSTM params +cell_dim=512 +rp_dim=128 +nrp_dim=128 +chunk_left_context=40 +chunk_right_context=40 + +# training options +srand=0 +num_jobs_initial=2 +num_jobs_final=12 +samples_per_iter=20000 +num_epochs=10 +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +local/nnet3/run_lstm.sh --affix $affix \ + --stage $stage \ + --srand $srand \ + --train-stage $train_stage \ + --train-set $train_set \ + --ihm-gmm $ihm_gmm \ + --nnet3-affix $nnet3_affix \ + --lstm-delay " [-1,1] [-2,2] [-3,3] " \ + --label-delay 0 \ + --cell-dim $cell_dim \ + --recurrent-projection-dim $rp_dim \ + --non-recurrent-projection-dim $nrp_dim \ + --common-egs-dir "$common_egs_dir" \ + --chunk-left-context $chunk_left_context \ + --chunk-right-context $chunk_right_context \ + --mic $mic \ + --num-jobs-initial $num_jobs_initial \ + --num-jobs-final $num_jobs_final \ + --samples-per-iter $samples_per_iter \ + --num-epochs $num_epochs \ + --use-ihm-ali $use_ihm_ali \ + --remove-egs $remove_egs + diff --git a/egs/ami/s5b/local/nnet3/run_lstm.sh b/egs/ami/s5b/local/nnet3/run_lstm.sh new file mode 100755 index 00000000000..c5583e2d0ef --- /dev/null +++ b/egs/ami/s5b/local/nnet3/run_lstm.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# This is the standard "lstm" system, built in nnet3. +# Please see RESULTS_* for examples of command lines invoking this script. + + +# local/nnet3/run_lstm.sh --mic sdm1 --use-ihm-ali true + +# local/nnet3/run_lstm.sh --mic ihm --stage 11 +# local/nnet3/run_lstm.sh --mic ihm --train-set train --gmm tri3 --nnet3-affix "" & +# +# local/nnet3/run_lstm.sh --mic sdm1 --stage 11 --affix cleaned2 --gmm tri4a_cleaned2 --train-set train_cleaned2 & + +# local/nnet3/run_lstm.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" & + +# local/nnet3/run_lstm.sh --use-ihm-ali true --mic mdm8 & + +# local/nnet3/run_lstm.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" & + +# this is an example of how you'd train a non-IHM system with the IHM +# alignments. the --gmm option in this case refers to the IHM gmm that's used +# to get the alignments. +# local/nnet3/run_lstm.sh --mic sdm1 --use-ihm-ali true --affix cleaned2 --gmm tri4a --train-set train_cleaned2 & + + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
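# (Cross-reference, for illustration: local/nnet3/run_blstm.sh, added in this
#  same patch, wraps this script and overrides the recurrence options to build
#  the bidirectional system, e.g. --lstm-delay " [-1,1] [-2,2] [-3,3] " and
#  --label-delay 0, whereas the unidirectional defaults below are
#  lstm_delay=" -1 -2 -3 " and label_delay=5.)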
+stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +ihm_gmm=tri3 # Only relevant if $use_ihm_ali is true, the name of the gmm-dir in + # the ihm directory that is to be used for getting alignments. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +common_egs_dir= +reporting_email= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 +max_param_change=2.0 + +# training options +train_stage=-10 +srand=0 +num_epochs=10 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=2 +num_jobs_final=12 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= +decode_iter= + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null || true + if [ -z $extra_left_context ]; then + extra_left_context=$chunk_left_context + fi + if [ -z $extra_right_context ]; then + extra_right_context=$chunk_right_context + fi + if [ -z $frames_per_chunk ]; then + frames_per_chunk=$chunk_width + fi + model_opts= + [ ! -z $decode_iter ] && model_opts=" --iter $decode_iter "; + for decode_set in dev eval; do + ( + num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_${decode_set} + steps/nnet3/decode.sh --nj 250 --cmd "$decode_cmd" \ + $model_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/$mic/${decode_set}_hires $decode_dir || exit 1; + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: error detected during decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 79d633b1ebd..522498d847d 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 5fa4ea565cd..c11420e5cfd 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ 
--chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index b70da4e852a..a48e7ed55af 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 51ca7db0495..5a68947282a 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh deleted file mode 100755 index ded03563711..00000000000 --- a/egs/swbd/s5c/local/chain/compare_wer.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - - -echo -n "System " -for x in $*; do printf "% 10s" $x; done -echo - -echo -n "WER on train_dev(tg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on train_dev(fg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on eval2000(tg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on eval2000(fg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "Final train prob " -for x in $*; do - prob=$(grep Overall 
exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final train prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..c8aae0b3b94 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh new file mode 100755 index 00000000000..542dae82581 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +models="" +for x in $*; do models="$models tdnn_${x}"; done + +local/chain/compare_wer_general.sh $models diff --git a/egs/swbd/s5c/local/chain/run_blstm.sh b/egs/swbd/s5c/local/chain/run_blstm.sh new file mode 120000 index 00000000000..0160247619f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm.sh @@ -0,0 +1 @@ +tuning/run_blstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_lstm.sh b/egs/swbd/s5c/local/chain/run_lstm.sh index 28e5e6cc20c..8b421ac2649 120000 --- a/egs/swbd/s5c/local/chain/run_lstm.sh +++ b/egs/swbd/s5c/local/chain/run_lstm.sh @@ -1 +1 @@ -tuning/run_lstm_6i.sh \ No newline at end of file 
+tuning/run_lstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 669740d5f27..7b86453e14b 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7f.sh \ No newline at end of file +tuning/run_tdnn_7h.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index 95f7aef2708..9ab72b40ac2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index 26cdaed29d7..6e1712c5187 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh new file mode 100755 index 00000000000..496bf502491 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -0,0 +1,228 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev-0 +# and bias-stddev=0 initialization. +# This run also accounts for changes in training due to the BackpropTruncationComponent + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index fbced146199..3155e21b618 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index c5548cbfa5c..f1a42cc175c 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh new file mode 100755 index 00000000000..4c765d35d30 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev-0 +# and bias-stddev=0 initialization. + + + +# This run is affected by the bug that per-element-scale components do not have +# max-change. The updated results without the bug will be submitted soon. 
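# (The table below is in the column format printed by the new
#  local/chain/compare_wer_general.sh; presumably it was produced with
#  something like "local/chain/compare_wer_general.sh lstm_6i_ld5 lstm_6j_ld5",
#  though the exact invocation is not recorded here.)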
+#System lstm_6i_ld5 lstm_6j_ld5 +#WER on train_dev(tg) 14.65 14.43 +#WER on train_dev(fg) 13.38 13.17 +#WER on eval2000(tg) 16.9 16.9 +#WER on eval2000(fg) 15.4 15.3 +#Final train prob -0.0751668-0.0795697 +#Final valid prob -0.0928206-0.0926466 +#Final train prob (xent) -1.34549 -1.16067 +#Final valid prob (xent) -1.41301 -1.23679 +# + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
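  # (Worked example, for clarity: with xent_regularize=0.025 as set above,
  #  learning_rate_factor = 0.5 / 0.025 = 20 for this output layer.)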
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index 28c20c92ab0..a678fe22044 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then --chain.xent-regularize $xent_regularize \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 2.0 \ --trainer.num-epochs 4 \ @@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts="--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh new file mode 100755 index 00000000000..7a4512097d3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -0,0 +1,228 @@ +#!/bin/bash + + +# 7g is same as 7f but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. + + +# System 7f 7g +# WER on train_dev(tg) 14.46 13.85 +# WER on train_dev(fg) 13.23 12.67 +# WER on eval2000(tg) 17.0 16.5 +# WER on eval2000(fg) 15.4 14.8 +# Final train prob -0.0882071 -0.0885075 +# Final valid prob -0.107545 -0.113462 +# Final train prob (xent) -1.26246 -1.25788 +# Final valid prob (xent) -1.35525 -1.37058 + + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +#common_egs_dir=exp/chain/tdnn_7e_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + # it doesn't make sense to have -6,0,6 splicing for a chain model + # as we compute a sequence of outputs and computation can be shared + # this has to be split into two -3,0,3 layers. But I will keep this + # to have same setup as 7f + relu-renorm-layer name=tdnn6 input=Append(-6,0,6) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh new file mode 100755 index 00000000000..00743ca9ebf --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +#System tdnn_7g tdnn_7h +#WER on train_dev(tg) 13.98 13.84 +#WER on train_dev(fg) 12.78 12.84 +#WER on eval2000(tg) 16.7 16.5 +#WER on eval2000(fg) 14.9 14.8 +#Final train prob -0.0817467-0.0889771 +#Final valid prob -0.110475 -0.113102 +#Final train prob (xent) -1.20065 -1.2533 +#Final valid prob (xent) -1.3313 -1.36743 +# +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
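# Example (hypothetical invocation): any of the defaults above can be
# overridden on the command line via utils/parse_options.sh, e.g.
#   local/chain/tuning/run_tdnn_7h.sh --stage 13 --train-stage -10 --num-epochs 4
# (dashes in option names map to the underscored variables above).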
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 1908b390151..be984ac24ee 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -8,8 +8,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# -. cmd.sh +# stage=0 @@ -26,7 +25,7 @@ extra_right_context=40 extra_left_context_initial=-1 extra_right_context_final=-1 -. cmd.sh +. ./cmd.sh . ./path.sh . 
./utils/parse_options.sh @@ -52,9 +51,9 @@ effective_learning_rate=0.0000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false +adjust_priors=true # May need to be set to false # because it does not help in some setups modify_learning_rates=true last_layer_factor=0.1 @@ -64,8 +63,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py new file mode 100644 index 00000000000..2a472386568 --- /dev/null +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -0,0 +1,9 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +""" This package contains modules and subpackages used in kaldi scripts. +""" + +__all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/__init__.py b/egs/wsj/s5/steps/libs/nnet3/__init__.py new file mode 100644 index 00000000000..03131a3a8d6 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/__init__.py @@ -0,0 +1,12 @@ + +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vimal Manohar +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + + +# This module has the python functions which facilitate the use of nnet3 toolkit +# It has two sub-modules +# xconfig : Library for parsing high level description of neural networks +# train : Library for training scripts diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py new file mode 100644 index 00000000000..6c824b1195b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + +"""This library has classes and methods to form neural network computation graphs, +in the nnet3 framework, using higher level abstractions called 'layers' +(e.g. sub-graphs like LSTMS ). + +Note : We use the term 'layer' though the computation graph can have a highly +non-linear structure as, other terms such as nodes/components have already been +used in C++ codebase of nnet3. + +This is basically a config parser module, where the configs have very concise +descriptions of a neural network. + +This module has methods to convert the xconfigs into a configs interpretable by +nnet3 C++ library. + +It generates three different configs: + 'init.config' : which is the config with the info necessary for computing + the preconditioning matrix i.e., LDA transform + e.g. 
+ input-node name=input dim=40 + input-node name=ivector dim=100 + output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear + + 'ref.config' : which is a version of the config file used to generate + a model for getting left and right context (it doesn't read + anything for the LDA-like transform and/or + presoftmax-prior-scale components) + + 'final.config' : which has the actual config used to initialize the model used + in training i.e, it has file paths for LDA transform and + other initialization files +""" + + +__all__ = ["utils", "layers", "parser"] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py new file mode 100644 index 00000000000..35f19e5a626 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -0,0 +1,902 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the parent class from which all layers are inherited +and some basic layer definitions. +""" + +from __future__ import print_function +import sys +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +class XconfigLayerBase(object): + """ A base-class for classes representing layers of xconfig files. + """ + + def __init__(self, first_token, key_to_value, all_layers): + """ + first_token: first token on the xconfig line, e.g. 'affine-layer'.f + key_to_value: dictionary with parameter values + { 'name':'affine1', + 'input':'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))', + 'dim=1024' }. + The only required and 'special' values that are dealt with directly + at this level, are 'name' and 'input'. The rest are put in + self.config and are dealt with by the child classes' init functions. + all_layers: An array of objects inheriting XconfigLayerBase for all + previously parsed layers. + """ + + self.layer_type = first_token + if not 'name' in key_to_value: + raise xparser_error("Expected 'name' to be specified.", self.str()) + self.name = key_to_value['name'] + if not xutils.is_valid_line_name(self.name): + raise xparser_error("Invalid value: name={0}".format( + key_to_value['name']), self.str()) + + # the following, which should be overridden in the child class, sets + # default config parameters in self.config. + self.set_default_configs() + # The following is not to be reimplemented in child classes; + # it sets the config values to those specified by the user, and + # parses any Descriptors. + self.set_configs(key_to_value, all_layers) + # This method, sets the derived default config values + # i.e., config values when not specified can be derived from + # other values. It can be overridden in the child class. + self.set_derived_configs() + # the following, which should be overridden in the child class, checks + # that the config parameters that have been set are reasonable. + self.check_configs() + + + def set_configs(self, key_to_value, all_layers): + """ Sets the config variables. + We broke this code out of __init__ for clarity. + the child-class constructor will deal with the configuration values + in a more specific way. 
+ """ + + for key,value in key_to_value.items(): + if key != 'name': + if not key in self.config: + raise xparser_error("Configuration value {0}={1} was not" + " expected in layer of type {2}" + "".format(key, value, self.layer_type), + self.str()) + self.config[key] = xutils.convert_value_to_type(key, + type(self.config[key]), + value) + self.descriptors = dict() + self.descriptor_dims = dict() + # Parse Descriptors and get their dims and their 'final' string form. + # in self.descriptors[key] + for key in self.get_input_descriptor_names(): + if not key in self.config: + raise xparser_error("{0}: object of type {1} needs to override" + " get_input_descriptor_names()." + "".format(sys.argv[0], str(type(self))), + self.str()) + descriptor_string = self.config[key] # input string. + assert isinstance(descriptor_string, str) + desc = self.convert_to_descriptor(descriptor_string, all_layers) + desc_dim = self.get_dim_for_descriptor(desc, all_layers) + desc_norm_str = desc.str() + + # desc_output_str contains the "final" component names, those that + # appear in the actual config file (i.e. not names like + # 'layer.auxiliary_output'); that's how it differs from desc_norm_str. + # Note: it's possible that the two strings might be the same in + # many, even most, cases-- it depends whether + # output_name(self, auxiliary_output) + # returns self.get_name() + '.' + auxiliary_output + # when auxiliary_output is not None. + # That's up to the designer of the layer type. + desc_output_str = self.get_string_for_descriptor(desc, all_layers) + self.descriptors[key] = {'string':desc, + 'normalized-string':desc_norm_str, + 'final-string':desc_output_str, + 'dim':desc_dim} + + # the following helps to check the code by parsing it again. + desc2 = self.convert_to_descriptor(desc_norm_str, all_layers) + desc_norm_str2 = desc2.str() + # if the following ever fails we'll have to do some debugging. + if desc_norm_str != desc_norm_str2: + raise xparser_error("Likely code error: '{0}' != '{1}'" + "".format(desc_norm_str, desc_norm_str2), + self.str()) + + def str(self): + """Converts 'this' to a string which could be printed to + an xconfig file; in xconfig_to_configs.py we actually expand all the + lines to strings and write it as xconfig.expanded as a reference + (so users can see any defaults). + """ + + ans = '{0} name={1}'.format(self.layer_type, self.name) + ans += ' ' + ' '.join([ '{0}={1}'.format(key, self.config[key]) + for key in sorted(self.config.keys())]) + return ans + + def __str__(self): + + return self.str() + + + def normalize_descriptors(self): + """Converts any config variables in self.config which correspond to + Descriptors, into a 'normalized form' derived from parsing them as + Descriptors, replacing things like [-1] with the actual layer names, + and regenerating them as strings. We stored this when the object was + initialized, in self.descriptors; this function just copies them back + to the config. + """ + + for key, desc_str_dict in self.descriptors.items(): + self.config[key] = desc_str_dict['normalized-string'] + + def convert_to_descriptor(self, descriptor_string, all_layers): + """Convenience function intended to be called from child classes, + converts a string representing a descriptor ('descriptor_string') + into an object of type Descriptor, and returns it. It needs 'self' and + 'all_layers' (where 'all_layers' is a list of objects of type + XconfigLayerBase) so that it can work out a list of the names of other + layers, and get dimensions from them. 
+ """ + + prev_names = xutils.get_prev_names(all_layers, self) + tokens = xutils.tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (descriptor, pos) = xutils.parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise xparser_error("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1]), self.str()) + return descriptor + + def get_dim_for_descriptor(self, descriptor, all_layers): + """Returns the dimension of a Descriptor object. This is a convenience + function used in set_configs. + """ + + layer_to_dim_func = \ + lambda name: xutils.get_dim_from_layer_name(all_layers, self, + name) + return descriptor.dim(layer_to_dim_func) + + def get_string_for_descriptor(self, descriptor, all_layers): + """Returns the 'final' string form of a Descriptor object, + as could be used in config files. This is a convenience function + provided for use in child classes; + """ + + layer_to_string_func = \ + lambda name: xutils.get_string_from_layer_name(all_layers, + self, name) + return descriptor.config_string(layer_to_string_func) + + def get_name(self): + """Returns the name of this layer, e.g. 'affine1'. It does not + necessarily correspond to a component name. + """ + + return self.name + + ###### Functions that might be overridden by the child class: ##### + + def set_default_configs(self): + """Child classes should override this. + """ + + raise Exception("Child classes must override set_default_configs().") + + def set_derived_configs(self): + """This is expected to be called after set_configs and before + check_configs(). + """ + + if self.config['dim'] <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + + def check_configs(self): + """child classes should override this. + """ + + pass + + def get_input_descriptor_names(self): + """This function, which may be (but usually will not have to be) + overridden by child classes, returns a list of names of the input + descriptors expected by this component. Typically this would just + return ['input'] as most layers just have one 'input'. However some + layers might require more inputs (e.g. cell state of previous LSTM layer + in Highway LSTMs). It is used in the function 'normalize_descriptors()'. + This implementation will work for layer types whose only + Descriptor-valued config is 'input'. + If a child class adds more inputs, or does not have an input + (e.g. the XconfigInputLayer), it should override this function's + implementation to something like: `return ['input', 'input2']` + """ + + return [ 'input' ] + + def auxiliary_outputs(self): + """Returns a list of all auxiliary outputs that this layer supports. + These are either 'None' for the regular output, or a string + (e.g. 'projection' or 'memory_cell') for any auxiliary outputs that + the layer might provide. Most layer types will not need to override + this. + """ + + return [ None ] + + def output_name(self, auxiliary_output = None): + """Called with auxiliary_output == None, this returns the component-node + name of the principal output of the layer (or if you prefer, the text + form of a descriptor that gives you such an output; such as + Append(some_node, some_other_node)). + The 'auxiliary_output' argument is a text value that is designed for + extensions to layers that have additional auxiliary outputs. 
+ For example, to implement a highway LSTM you need the memory-cell of a + layer, so you might allow auxiliary_output='memory_cell' for such a + layer type, and it would return the component node or a suitable + Descriptor: something like 'lstm3.c_t' + """ + + raise Exception("Child classes must override output_name()") + + def output_dim(self, auxiliary_output = None): + """The dimension that this layer outputs. The 'auxiliary_output' + parameter is for layer types which support auxiliary outputs. + """ + + raise Exception("Child classes must override output_dim()") + + def get_full_config(self): + """This function returns lines destined for the 'full' config format, as + would be read by the C++ programs. Since the program + xconfig_to_configs.py writes several config files, this function returns + a list of pairs of the form (config_file_basename, line), + e.g. something like + [ ('init', 'input-node name=input dim=40'), + ('ref', 'input-node name=input dim=40') ] + which would be written to config_dir/init.config and config_dir/ref.config. + """ + + raise Exception("Child classes must override get_full_config()") + + +class XconfigInputLayer(XconfigLayerBase): + """This class is for lines like + 'input name=input dim=40' + or + 'input name=ivector dim=100' + in the config file. + """ + + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'input' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + self.config = { 'dim': -1} + + def check_configs(self): + + if self.config['dim'] <= 0: + raise xparser_error("Dimension of input-layer '{0}'" + "should be positive.".format(self.name), + self.str()) + + def get_input_descriptor_names(self): + + return [] # there is no 'input' field in self.config. + + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.config['dim'] + + def get_full_config(self): + + # unlike other layers the input layers need to be printed in + # 'init.config' (which initializes the neural network prior to the LDA) + ans = [] + for config_name in [ 'init', 'ref', 'final' ]: + ans.append( (config_name, + 'input-node name={0} dim={1}'.format(self.name, + self.config['dim']))) + return ans + + + +class XconfigTrivialOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + This is for outputs that are not really output "layers" + (there is no affine transform or nonlinearity), they just directly map to an + output-node in nnet3. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]' } + + def check_configs(self): + + pass # nothing to check; descriptor-parsing can't happen in this function. 
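+    # For orientation, a rough usage sketch (the file name 'network.xconfig' and
+    # the dims below are hypothetical): the driver script xconfig_to_configs.py
+    # reads the xconfig file into a list of layer objects and then collects the
+    # (config-basename, line) pairs that each layer's get_full_config() returns:
+    #
+    #   all_layers = read_xconfig_file('network.xconfig')
+    #   # where network.xconfig might contain lines like:
+    #   #   input name=ivector dim=100
+    #   #   input name=input dim=40
+    #   #   output name=output input=Append(input@-1, input@0, input@1)
+    #   for layer in all_layers:
+    #       for config_basename, line in layer.get_full_config():
+    #           print(config_basename, line)   # destined for init/ref/final.config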
+ + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the output + # of the previous layer + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + assert auxiliary_outputs is None + # note: each value of self.descriptors is (descriptor, dim, normalized-string, output-string). + return self.descriptors['input']['dim'] + + def get_full_config(self): + + # the input layers need to be printed in 'init.config' (which + # initializes the neural network prior to the LDA), in 'ref.config', + # which is a version of the config file used for getting left and right + # context (it doesn't read anything for the LDA-like transform and/or + # presoftmax-prior-scale components) + # In 'full.config' we write everything, this is just for reference, + # and also for cases where we don't use the LDA-like transform. + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'output-string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_str = self.descriptors['input']['final-string'] + + for config_name in ['init', 'ref', 'final' ]: + ans.append( (config_name, + 'output-node name={0} input={1}'.format( + self.name, descriptor_final_str))) + return ans + + +class XconfigOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output-layer name=output dim=4257 input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + By default this includes a log-softmax component. The parameters are + initialized to zero, asthis is best for output layers. + + Parameters of the class, and their defaults: + input='[-1]' : Descriptor giving the input of the layer. + dim=None : Output dimension of layer, will normally equal the number of pdfs. + include-log-softmax=true : setting it to false will omit the + log-softmax component- useful for chain models. + objective-type=linear : the only other choice currently is + 'quadratic', for use in regression problems + learning-rate-factor=1.0 : Learning rate factor for the final + affine component, multiplies the standard learning rate. normally + you'll leave this as-is, but for xent regularization output layers + for chain models you'll want to set + learning-rate-factor=(0.5/xent_regularize), + normally learning-rate-factor=5.0 since xent_regularize is + normally 0.1. + presoftmax-scale-file=None : If set, a filename for a vector that + will be used to scale the output of the affine component before the + log-softmax (if include-log-softmax=true), or before the output + (if not). This is helpful to avoid instability in training due to + some classes having much more data than others. The way we normally + create this vector is to take the priors of the classes to the + power -0.25 and rescale them so the average is 1.0. This factor + -0.25 is referred to as presoftmax_prior_scale_power in scripts. In + the scripts this would normally be set to + config_dir/presoftmax_prior_scale.vec + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. 
+ self.config = {'input' : '[-1]', + 'dim' : -1, + 'include-log-softmax' : True, + # this would be false for chain models + 'objective-type' : 'linear', + # see Nnet::ProcessOutputNodeConfigLine in + # nnet-nnet.cc for other options + 'learning-rate-factor' : 1.0, + 'presoftmax-scale-file' : '', + # used in DNN (not RNN) training when using + # frame-level objfns, + 'max-change' : 1.5, + 'param-stddev' : 0.0, + 'bias-stddev' : 0.0, + 'output-delay' : 0 + } + + def check_configs(self): + + if self.config['dim'] <= -1: + raise xparser_error("In output-layer, dim has invalid value {0}" + "".format(self.config['dim']), self.str()) + + if self.config['objective-type'] != 'linear' and \ + self.config['objective_type'] != 'quadratic': + raise xparser_error("In output-layer, objective-type has" + " invalid value {0}" + "".format(self.config['objective-type']), + self.str()) + + if self.config['learning-rate-factor'] <= 0.0: + raise xparser_error("In output-layer, learning-rate-factor has" + " invalid value {0}" + "".format(self.config['learning-rate-factor']), + self.str()) + + + # you cannot access the output of this layer from other layers... see + # comment in output_name for the reason why. + def auxiliary_outputs(self): + + return [] + + def output_name(self, auxiliary_outputs = None): + + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. But it might make sense to make the output of the softmax + # layer and/or the output of the affine layer available as inputs to + # other layers, in some circumstances. + # we'll implement that when it's needed. + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def output_dim(self, auxiliary_output = None): + + # see comment in output_name(). + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def get_full_config(self): + + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + objective_type = self.config['objective-type'] + learning_rate_factor = self.config['learning-rate-factor'] + include_log_softmax = self.config['include-log-softmax'] + presoftmax_scale_file = self.config['presoftmax-scale-file'] + param_stddev = self.config['param-stddev'] + bias_stddev = self.config['bias-stddev'] + output_delay = self.config['output-delay'] + max_change = self.config['max-change'] + + # note: ref.config is used only for getting the left-context and + # right-context of the network; + # final.config is where we put the actual network definition. + for config_name in [ 'ref', 'final' ]: + # First the affine node. 
+ line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} ' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change) + + ('learning-rate-factor={0} '.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '')) + ans.append((config_name, line)) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + if presoftmax_scale_file is not '' and config_name == 'final': + # don't use the presoftmax-scale in 'ref.config' since that + # file won't exist at the time we evaluate it. + # (ref.config is used to find the left/right context). + line = ('component name={0}.fixed-scale' + ' type=FixedScaleComponent scales={1}' + ''.format(self.name, presoftmax_scale_file)) + ans.append((config_name, line)) + + line = ('component-node name={0}.fixed-scale' + ' component={0}.fixed-scale input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.fixed-scale'.format(self.name) + + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + ans.append((config_name, line)) + + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.log-softmax'.format(self.name) + + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + + line = ('output-node name={0} input={1}'.format(self.name, cur_node)) + ans.append((config_name, line)) + return ans + + +# This class is for parsing lines like +# 'relu-renorm-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# or: +# 'sigmoid-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# which specify addition of an affine component and a sequence of non-linearities. +# Here, the name of the layer itself dictates the sequence of nonlinearities +# that are applied after the affine component; the name should contain some +# combination of 'relu', 'renorm', 'sigmoid' and 'tanh', +# and these nonlinearities will be added along with the affine component. +# +# The dimension specified is the output dim; the input dim is worked out from the input descriptor. +# This class supports only nonlinearity types that do not change the dimension; we can create +# another layer type to enable the use p-norm and similar dimension-reducing nonlinearities. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer, e.g. 1024] +# self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] +# +class XconfigBasicLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + # Here we just list some likely combinations.. you can just add any + # combinations you want to use, to this list. + print(first_token) + assert first_token in [ 'relu-layer', 'relu-renorm-layer', 'sigmoid-layer', + 'tanh-layer' ] + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. 
+ self.config = { 'input':'[-1]', + 'dim':-1, + 'max-change' : 0.75, + 'bias-stddev' : 0, + 'param-stddev' : -1, # default value is derived + 'self-repair-scale' : 1.0e-05, + 'target-rms' : 1.0, + 'ng-affine-options' : ''} + + def set_derived_configs(self): + super(XconfigBasicLayer, self).set_derived_configs() + if self.config['param-stddev'] < 0: + self.config['param-stddev'] = 1.0 / self.descriptors['input']['dim'] + + + def check_configs(self): + if self.config['dim'] < 0: + raise xparser_error("dim has invalid value {0}".format(self.config['dim']), self.str()) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: + raise xparser_error("self-repair-scale has invalid value {0}".format(self.config['self-repair-scale']), self.str()) + if self.config['target-rms'] < 0.0: + raise xparser_error("target-rms has invalid value {0}".format(self.config['target-rms']), self.str()) + + def output_name(self, auxiliary_output=None): + # at a later stage we might want to expose even the pre-nonlinearity + # vectors + assert auxiliary_output == None + + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + last_nonlinearity = split_layer_name[-2] + # return something like: layer3.renorm + return '{0}.{1}'.format(self.name, last_nonlinearity) + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + + ans = [] + + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + nonlinearities = split_layer_name[:-1] + + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + self_repair_scale = self.config['self-repair-scale'] + target_rms = self.config['target-rms'] + param_stddev = self.config['param-stddev'] + bias_stddev = self.config['bias-stddev'] + max_change = self.config['max-change'] + ng_opt_str = self.config['ng-affine-options'] + + for config_name in [ 'ref', 'final' ]: + # First the affine node. 
+ line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5}' + ' {6}' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_opt_str)) + ans.append((config_name, line)) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + for nonlinearity in nonlinearities: + if nonlinearity == 'relu': + line = ('component name={0}.{1}' + ' type=RectifiedLinearComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'sigmoid': + line = ('component name={0}.{1}' + ' type=SigmoidComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'tanh': + line = ('component name={0}.{1}' + ' type=TanhComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'renorm': + line = ('component name={0}.{1}' + ' type=NormalizeComponent dim={2}' + ' target-rms={3}' + ''.format(self.name, nonlinearity, output_dim, + target_rms)) + + else: + raise xparser_error("Unknown nonlinearity type:" + "{0}".format(nonlinearity), self.str()) + + ans.append((config_name, line)) + line = ('component-node name={0}.{1}' + ' component={0}.{1} input={2}' + ''.format(self.name, nonlinearity, cur_node)) + + ans.append((config_name, line)) + cur_node = '{0}.{1}'.format(self.name, nonlinearity) + return ans + + +# This class is for lines like +# 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat' +# +# The output dimension of the layer may be specified via 'dim=xxx', but if not specified, +# the dimension defaults to the same as the input. Note: we don't attempt to read that +# file at the time the config is created, because in the recipes, that file is created +# after the config files. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer; defaults to the same as the input dim.] +# affine-transform-file='' [Must be specified.] +# +class XconfigFixedAffineLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'fixed-affine-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim':-1, + 'affine-transform-file':''} + + def check_configs(self): + if self.config['affine-transform-file'] is None: + raise xparser_error("affine-transform-file must be set.", self.str()) + + def output_name(self, auxiliary_output = None): + # Fixed affine layer computes only one vector, there are no intermediate + # vectors. + assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. 
+ if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + transform_file = self.config['affine-transform-file'] + + + # to init.config we write an output-node with the name 'output' and + # with a Descriptor equal to the descriptor that's the input to this + # layer. This will be used to accumulate stats to learn the LDA transform. + line = 'output-node name=output input={0}'.format(descriptor_final_string) + ans.append(('init', line)) + + # write the 'real' component to final.config + line = 'component name={0} type=FixedAffineComponent matrix={1}'.format( + self.name, transform_file) + ans.append(('final', line)) + # write a random version of the component, with the same dims, to ref.config + line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format( + self.name, input_dim, output_dim) + ans.append(('ref', line)) + # the component-node gets written to final.config and ref.config. + line = 'component-node name={0} component={0} input={1}'.format( + self.name, descriptor_final_string) + ans.append(('final', line)) + ans.append(('ref', line)) + return ans + +# This class is for lines like +# 'affine-layer name=affine input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0))' +# +# The output dimension of the layer may be specified via 'dim=xxx', but if not specified, +# the dimension defaults to the same as the input. Note: we don't attempt to read that +# file at the time the config is created, because in the recipes, that file is created +# after the config files. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer; defaults to the same as the input dim.] +# +class XconfigAffineLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'affine-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. 
+ # use None for optional parameters as we want to default to the C++ defaults + # C++ component provides more options but I will just expose these for now + # Note : The type of the parameter is determined based on the value assigned + # so please use decimal point if your parameter is a float + self.config = { 'input' : '[-1]', + 'dim' : -1, + 'param-stddev' : -1.0, # this has to be initialized to 1/sqrt(input_dim) + 'bias-stddev' : 1.0, + 'bias-mean' : 0.0, + 'max-change' : 0.75, + 'learning-rate-factor' : 1.0, + 'ng-affine-options' : ''} + + def set_derived_configs(self): + super(XconfigAffineLayer, self).set_derived_configs() + if self.config['param-stddev'] < 0: + self.config['param-stddev'] = 1.0 / self.descriptors['input']['dim'] + + def check_configs(self): + if self.config['dim'] <= 0: + raise xparser_error("dim specified is invalid".format(self.name, self.layer_type), self.str()) + + def output_name(self, auxiliary_output = None): + # affine layer computes only one vector, there are no intermediate + # vectors. + assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + + option_string='' + for key in ['param-stddev', 'bias-stddev', 'bias-mean', 'max-change']: + option_string += ' {0}={1}'.format(key, self.config[key]) + option_string += self.config['ng-affine-options'] + + conf_lines = [] + # write the 'real' component to final.config + conf_lines.append('component name={n} type=NaturalGradientAffineComponent ' + 'input-dim={i} output-dim={o} {opts}'.format(n = self.name, + i = input_dim, + o = output_dim, + opts = option_string)) + # the component-node gets written to final.config and ref.config. + conf_lines.append('component-node name={0} component={0} input={1}'.format(self.name, + descriptor_final_string)) + + # the config is same for both final and ref configs + for conf_name in ['final', 'ref']: + for line in conf_lines: + ans.append((conf_name, line)) + return ans + + +def test_layers(): + # for some config lines that should be printed the same way as they + # are read, check that this is the case. + for x in [ 'input name=input dim=30' ]: + assert str(config_line_to_object(x, [])) == x diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py new file mode 100644 index 00000000000..fa356d15a18 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -0,0 +1,7 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. 
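+# This module only aggregates the layer definitions from basic_layers.py and
+# lstm.py, so that parser.py can refer to every layer class through the single
+# 'xlayers' namespace.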
+ +from basic_layers import * +from lstm import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py new file mode 100644 index 00000000000..7b37958f81b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -0,0 +1,532 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + + +""" This module has the implementations of different LSTM layers. +""" +import re + +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# This class is for lines like +# 'lstm-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph without output projections. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] +# norm-based-clipping=True [specifies if the gradient clipping has to activated based on total norm or based on per-element magnitude] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] +# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +class XconfigLstmLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstm-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'clipping-threshold' : 30.0, + 'norm-based-clipping' : True, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0 + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.InputDim() + + def check_configs(self): + key = 'cell-dim' + if self.config['cell-dim'] <= 0: + raise xparser_error("cell-dim has invalid value {0}.".format(self.config[key]), self.str()) + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'm_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in 
self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str())
+
+        return self.config['cell-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self.generate_lstm_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user-specified matrices in LSTM initialization,
+                # so the 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    # convenience function to generate the LSTM config
+    def generate_lstm_config(self):
+
+        # assign some variables to reduce verbosity
+        name = self.name
+        # in the code below we refer to descriptor strings simply as descriptors, for conciseness
+        input_dim = self.descriptors['input']['dim']
+        input_descriptor = self.descriptors['input']['final-string']
+        cell_dim = self.config['cell-dim']
+        delay = self.config['delay']
+
+        repair_nonlin = self.config['self-repair-scale-nonlinearity']
+        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
+        bptrunc_str = ("clipping-threshold={0}"
+                       " zeroing-threshold={1}"
+                       " zeroing-interval={2}"
+                       " recurrence-interval={3}"
+                       "".format(self.config['clipping-threshold'],
+                                 self.config['zeroing-threshold'],
+                                 self.config['zeroing-interval'],
+                                 abs(delay)))
+        affine_str = self.config['ng-affine-options']
+        # Natural gradient per-element scale parameters
+        # TODO: decide if we want to keep exposing these options
+        pes_str = self.config['ng-per-element-scale-options']
+        if re.search('param-mean', pes_str) is None and \
+           re.search('param-stddev', pes_str) is None:
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
+
+        configs = []
+
+        # the equations implemented here are
+        # TODO: write these
+        # naming convention
+        # <layer-name>.W_<outputname>.<input_name> e.g.
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Input gate control : W_i* matrices") + configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Forget gate control : W_f* matrices") + configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Output gate control : W_o* matrices") + configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Cell input matrices : W_c* matrices") + configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + 
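+        # For reference, the recurrence that the nodes in this method appear to
+        # implement, with delay d (normally -1), '*' denoting elementwise
+        # multiplication and [x, r] denoting appending; biases live inside the
+        # affine components, and the BackpropTruncationComponents act as the
+        # identity in the forward pass:
+        #   i_t = sigmoid(W_i.xr [x_t, r_{t+d}] + w_i.c * c_{t+d})
+        #   f_t = sigmoid(W_f.xr [x_t, r_{t+d}] + w_f.c * c_{t+d})
+        #   g_t = tanh(W_c.xr [x_t, r_{t+d}])
+        #   c_t = f_t * c_{t+d} + i_t * g_t
+        #   o_t = sigmoid(W_o.xr [x_t, r_{t+d}] + w_o.c * c_t)
+        #   m_t = o_t * tanh(c_t), and r_t = m_t feeds the recurrence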
configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name)) + + return configs + + +# This class is for lines like +# 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigLstmLayer for this +# simple LSTM. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent_projection_dim [Dimension of the projection used in recurrent connections] +# non_recurrent_projection_dim [Dimension of the projection in non-recurrent connections] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. 
This is the threshold used to decide if clipping has to be activated ] +# norm-based-clipping=True [specifies if the gradient clipping has to activated based on total norm or based on per-element magnitude] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] +# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +class XconfigLstmpLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + print first_token + assert first_token == "lstmp-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, + 'non-recurrent-projection-dim' : -1, + 'clipping-threshold' : 30.0, + 'norm-based-clipping' : True, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0 + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.InputDim() + + for key in ['recurrent-projection-dim', 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + self.config[key] = self.config['cell-dim'] / 2 + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str()) + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise xparser_error("{0} has invalid value {2}.".format(self.layer_type, + key, + self.config[key])) + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'rp_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_lstm_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + configs = [] + # the equations implemented here are from Sak et. al. "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" + # http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf + # naming convention + # .W_. e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Input gate control : W_i* matrices") + configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Forget gate control : W_f* matrices") + configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Output gate control : W_o* matrices") + configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Cell input matrices : W_c* matrices") + configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component 
name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + recurrent_connection = '{0}.r_t'.format(name) + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m 
type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs + +# Same as the LSTMP layer except that the matrix multiplications are combined +# we probably keep only version after experimentation. One year old experiments +# show that this version is slightly worse and might require some tuning +class XconfigLstmpcLayer(XconfigLstmpLayer): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstmpc-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', ng_per_element_scale_options) is None and \ + re.search('param-stddev', ng_per_element_scale_options) is None: + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + pes_str = ng_per_element_scale_options + + configs = [] + # naming convention + # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Full W_ifoc* matrix") + configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + + # we will not combine the diagonal matrix operations as one of these has a different delay + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + rec_connection = '{0}.rp_t'.format(name) + + component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + + + offset = 0 + component_nodes.append("# i_t") + component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, 
{0}.f2_t)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, recurrent_projection_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py new file mode 100644 index 00000000000..7aacba1ee8f --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -0,0 +1,90 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the top level xconfig parsing functions. +""" + +import libs.nnet3.xconfig.layers as xlayers +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# We have to modify this dictionary when adding new layers +config_to_layer = { + 'input' : xlayers.XconfigInputLayer, + 'output' : xlayers.XconfigTrivialOutputLayer, + 'output-layer' : xlayers.XconfigOutputLayer, + 'relu-layer' : xlayers.XconfigBasicLayer, + 'relu-renorm-layer' : xlayers.XconfigBasicLayer, + 'sigmoid-layer' : xlayers.XconfigBasicLayer, + 'tanh-layer' : xlayers.XconfigBasicLayer, + 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, + 'affine-layer' : xlayers.XconfigAffineLayer, + 'lstm-layer' : xlayers.XconfigLstmLayer, + 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmpc-layer' : xlayers.XconfigLstmpcLayer + } + +# Converts a line as parsed by ParseConfigLine() into a first +# token e.g. 'input-layer' and a key->value map, into +# an objet inherited from XconfigLayerBase. 
+# 'prev_names' is a list of previous layer names, it's needed +# to parse things like '[-1]' (meaning: the previous layer) +# when they appear in Desriptors. +def parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names): + + conf_line = first_token + ' ' + ' '.join(['{0}={1}'.format(x,y) for x,y in key_to_value.items()]) + + if not config_to_layer.has_key(first_token): + raise xparser_error("No such layer type.", conf_line) + + try: + return config_to_layer[first_token](first_token, key_to_value, prev_names) + except xparser_error as e: + if e.conf_line is None: + # we want to throw informative errors which point to the xconfig line + e.conf_line = conf_line + raise + +# Uses ParseConfigLine() to turn a config line that has been parsed into +# a first token e.g. 'affine-layer' and a key->value map like { 'dim':'1024', 'name':'affine1' }, +# and then turns this into an object representing that line of the config file. +# 'prev_names' is a list of the names of preceding lines of the +# config file. +def config_line_to_object(config_line, prev_names = None): + (first_token, key_to_value) = xutils.parse_config_line(config_line) + return parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names) + +# This function reads an xconfig file and returns it as a list of layers +# (usually we use the variable name 'all_layers' elsewhere for this). +# It will die if the xconfig file is empty or if there was +# some error parsing it. +def read_xconfig_file(xconfig_filename): + try: + f = open(xconfig_filename, 'r') + except Exception as e: + sys.exit("{0}: error reading xconfig file '{1}'; error was {2}".format( + sys.argv[0], xconfig_filename, repr(e))) + all_layers = [] + while True: + line = f.readline() + if line == '': + break + x = xutils.parse_config_line(line) + if x is None: + continue # line was blank or only comments. + (first_token, key_to_value) = x + # the next call will raise an easy-to-understand exception if + # it fails. + this_layer = parsed_line_to_xconfig_layer(first_token, + key_to_value, + all_layers) + all_layers.append(this_layer) + if len(all_layers) == 0: + raise xparser_error("{0}: xconfig file '{1}' is empty".format( + sys.argv[0], xconfig_filename)) + f.close() + return all_layers + + diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py new file mode 100644 index 00000000000..87c9d880089 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -0,0 +1,615 @@ +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey). +# License: Apache 2.0. + +# This library contains various utilities that are involved in processing +# of xconfig -> config conversion. It contains "generic" lower-level code +# while xconfig_layers.py contains the code specific to layer types. + +from __future__ import print_function +import re +import sys + + +class XconfigParserError(RuntimeError): + def __init__(self, error_msg, conf_line=None): + self.conf_line = conf_line + if conf_line is not None: + self.msg = 'While parsing "{c}" :{e}'.format(c=conf_line, e=error_msg) + else: + self.msg = error_msg + + def __str__(self): + return self.msg + +# [utility function used in xconfig_layers.py] +# Given a list of objects of type XconfigLayerBase ('all_layers'), +# including at least the layers preceding 'current_layer' (and maybe +# more layers), return the names of layers preceding 'current_layer' +# This will be used in parsing expressions like [-1] in descriptors +# (which is an alias for the previous layer). 
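For readers following the xconfig parsing code, the snippet below is a minimal, self-contained sketch of the dispatch-table pattern that parsed_line_to_xconfig_layer uses. The dummy classes and the reduced config_to_layer map are stand-ins for the real classes in libs.nnet3.xconfig.layers, and the layer-line values are made up for illustration only.

from __future__ import print_function

class XconfigParserError(RuntimeError):
    pass

# Stand-ins for the real layer classes; they just record what they were given.
class DummyInputLayer(object):
    def __init__(self, first_token, key_to_value, prev_names):
        self.first_token = first_token
        self.config = key_to_value

class DummyBasicLayer(DummyInputLayer):
    pass

config_to_layer = {
    'input': DummyInputLayer,
    'relu-renorm-layer': DummyBasicLayer,
}

def parsed_line_to_layer(first_token, key_to_value, prev_names):
    # Unknown first tokens raise the parser error, as in parser.py above.
    if first_token not in config_to_layer:
        raise XconfigParserError("No such layer type: " + first_token)
    return config_to_layer[first_token](first_token, key_to_value, prev_names)

layer = parsed_line_to_layer('relu-renorm-layer',
                             {'name': 'tdnn1', 'dim': '512', 'input': 'Append(-1,0,1)'},
                             prev_names=['input'])
print(layer.config['name'])  # prints: tdnn1
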
+def get_prev_names(all_layers, current_layer): + prev_names = [] + for layer in all_layers: + if layer is current_layer: + break + prev_names.append(layer.get_name()) + prev_names_set = set() + for name in prev_names: + if name in prev_names_set: + raise XconfigParserError("{0}: Layer name {1} is used more than once.".format( + sys.argv[0], name), current_layer.str()) + prev_names_set.add(name) + return prev_names + + +# This is a convenience function to parser the auxiliary output name from the +# full layer name + +def split_layer_name(full_layer_name): + assert isinstance(full_layer_name, str) + split_name = full_layer_name.split('.') + if len(split_name) == 0: + raise XconfigParserError("Bad layer name: " + full_layer_name) + layer_name = split_name[0] + if len(split_name) == 1: + auxiliary_output = None + else: + # we probably expect len(split_name) == 2 in this case, + # but no harm in allowing dots in the auxiliary_output. + auxiliary_output = '.'.join(split_name[1:]) + + return [layer_name, auxiliary_output] + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a dimension. 'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output), layer.str()) + return layer.output_dim(auxiliary_output) + # No such layer was found. + if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a descriptor (usually, but not required to be a simple +# component-node name) that can appear in the generated config file. 'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_string_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format( + layer_name, auxiliary_output)) + return layer.output_name(auxiliary_output) + # No such layer was found. 
+ if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# This function, used in converting string values in config lines to +# configuration values in self.config in layers, attempts to +# convert 'string_value' to an instance dest_type (which is of type Type) +# 'key' is only needed for printing errors. +def convert_value_to_type(key, dest_type, string_value): + if dest_type == type(bool()): + if string_value == "True" or string_value == "true": + return True + elif string_value == "False" or string_value == "false": + return False + else: + raise XconfigParserError("Invalid configuration value {0}={1} (expected bool)".format( + key, string_value)) + elif dest_type == type(int()): + try: + return int(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(float()): + try: + return float(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(str()): + return string_value + + + +# This class parses and stores a Descriptor-- expression +# like Append(Offset(input, -3), input) and so on. +# For the full range of possible expressions, see the comment at the +# top of src/nnet3/nnet-descriptor.h. +# Note: as an extension to the descriptor format used in the C++ +# code, we can have e.g. input@-3 meaning Offset(input, -3); +# and if bare integer numbers appear where a descriptor was expected, +# they are interpreted as Offset(prev_layer, -3) where 'prev_layer' +# is the previous layer in the config file. + +# Also, in any place a raw input/layer/output name can appear, we accept things +# like [-1] meaning the previous input/layer/output's name, or [-2] meaning the +# last-but-one input/layer/output, and so on. +class Descriptor: + def __init__(self, + descriptor_string = None, + prev_names = None): + # self.operator is a string that may be 'Offset', 'Append', + # 'Sum', 'Failover', 'IfDefined', 'Offset', 'Switch', 'Round', + # 'ReplaceIndex'; it also may be None, representing the base-case + # (where it's just a layer name) + + # self.items will be whatever items are + # inside the parentheses, e.g. if this is Sum(foo bar), + # then items will be [d1, d2], where d1 is a Descriptor for + # 'foo' and d1 is a Descriptor for 'bar'. However, there are + # cases where elements of self.items are strings or integers, + # for instance in an expression 'ReplaceIndex(ivector, x, 0)', + # self.items would be [d, 'x', 0], where d is a Descriptor + # for 'ivector'. In the case where self.operator is None (where + # this Descriptor represents just a bare layer name), self. + # items contains the name of the input layer as a string. + self.operator = None + self.items = None + + if descriptor_string != None: + try: + tokens = tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (d, pos) = parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise XconfigParserError("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1])) + # copy members from d. 
+ self.operator = d.operator + self.items = d.items + except XconfigParserError as e: + traceback.print_tb(sys.exc_info()[2]) + raise XconfigParserError("Error parsing Descriptor '{0}', specific error was: {1}".format( + descriptor_string, repr(e))) + + # This is like the str() function, but it uses the layer_to_string function + # (which is a function from strings to strings) to convert layer names (or + # in general sub-layer names of the form 'foo.bar') to the component-node + # (or, in general, descriptor) names that appear in the final config file. + # This mechanism gives those designing layer types the freedom to name their + # nodes as they want. + def config_string(self, layer_to_string): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return layer_to_string(self.items[0]) + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join( + [ item.config_string(layer_to_string) if isinstance(item, Descriptor) else str(item) + for item in self.items]) + ')' + + def str(self): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return self.items[0] + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join([str(item) for item in self.items]) + ')' + + def __str__(self): + return self.str() + + # This function returns the dimension (i.e. the feature dimension) of the + # descriptor. It takes 'layer_to_dim' which is a function from + # layer-names (including sub-layer names, like lstm1.memory_cell) to + # dimensions, e.g. you might have layer_to_dim('ivector') = 100, or + # layer_to_dim('affine1') = 1024. + # note: layer_to_dim will raise an exception if a nonexistent layer or + # sub-layer is requested. + def dim(self, layer_to_dim): + if self.operator is None: + # base-case: self.items = [ layer_name ] (or sub-layer name, like + # 'lstm.memory_cell'). + return layer_to_dim(self.items[0]) + elif self.operator in [ 'Sum', 'Failover', 'IfDefined', 'Switch' ]: + # these are all operators for which all args are descriptors + # and must have the same dim. + dim = self.items[0].dim(layer_to_dim) + for desc in self.items[1:]: + next_dim = desc.dim(layer_to_dim) + if next_dim != dim: + raise XparserError("In descriptor {0}, different fields have different " + "dimensions: {1} != {2}".format(self.str(), dim, next_dim)) + return dim + elif self.operator in [ 'Offset', 'Round', 'ReplaceIndex' ]: + # for these operators, only the 1st arg is relevant. + return self.items[0].dim(layer_to_dim) + elif self.operator == 'Append': + return sum([ x.dim(layer_to_dim) for x in self.items]) + else: + raise XconfigParserError("Unknown operator {0}".format(self.operator)) + + + +# This just checks that seen_item == expected_item, and raises an +# exception if not. +def expect_token(expected_item, seen_item, what_parsing): + if seen_item != expected_item: + raise XconfigParserError("parsing {0}, expected '{1}' but got '{2}'".format( + what_parsing, expected_item, seen_item)) + +# returns true if 'name' is valid as the name of a line (input, layer or output); +# this is the same as IsValidname() in the nnet3 code. +def is_valid_line_name(name): + return isinstance(name, str) and re.match(r'^[a-zA-Z_][-a-zA-Z_0-9.]*', name) != None + +# This function for parsing Descriptors takes an array of tokens as produced +# by tokenize_descriptor. 
It parses a descriptor +# starting from position pos >= 0 of the array 'tokens', and +# returns a new position in the array that reflects any tokens consumed while +# parsing the descriptor. +# It returns a pair (d, pos) where d is the newly parsed Descriptor, +# and 'pos' is the new position after consuming the relevant input. +# 'prev_names' is so that we can find the most recent layer name for +# expressions like Append(-3, 0, 3) which is shorthand for the most recent +# layer spliced at those time offsets. +def parse_new_descriptor(tokens, pos, prev_names): + size = len(tokens) + first_token = tokens[pos] + pos += 1 + d = Descriptor() + + # when reading this function, be careful to note the indent level, + # there is an if-statement within an if-statement. + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # the 1st argument of all these operators is a Descriptor. + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items = [desc] + + if first_token == 'Offset': + expect_token(',', tokens[pos], 'Offset()') + pos += 1 + try: + t_offset = int(tokens[pos]) + pos += 1 + d.items.append(t_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + if tokens[pos] == ')': + return (d, pos + 1) + elif tokens[pos] != ',': + raise XconfigParserError("Parsing Offset(), expected ')' or ',', got " + tokens[pos]) + pos += 1 + try: + x_offset = int(tokens[pos]) + pos += 1 + d.items.append(x_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Offset()') + pos += 1 + elif first_token in [ 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + while True: + if tokens[pos] == ')': + # check num-items is correct for some special cases. + if first_token == 'Failover' and len(d.items) != 2: + raise XconfigParserError("Parsing Failover(), expected 2 items but got {0}".format(len(d.items))) + if first_token == 'IfDefined' and len(d.items) != 1: + raise XconfigParserError("Parsing IfDefined(), expected 1 item but got {0}".format(len(d.items))) + pos += 1 + break + elif tokens[pos] == ',': + pos += 1 # consume the comma. 
+ else: + raise XconfigParserError("Parsing Append(), expected ')' or ',', got " + tokens[pos]) + + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + elif first_token == 'Round': + expect_token(',', tokens[pos], 'Round()') + pos += 1 + try: + t_modulus = int(tokens[pos]) + assert t_modulus > 0 + pos += 1 + d.items.append(t_modulus) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Round()') + pos += 1 + elif first_token == 'ReplaceIndex': + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + if tokens[pos] in [ 'x', 't' ]: + d.items.append(tokens[pos]) + pos += 1 + else: + raise XconfigParserError("Parsing ReplaceIndex(), expected 'x' or 't', got " + + tokens[pos]) + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + try: + new_value = int(tokens[pos]) + pos += 1 + d.items.append(new_value) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'ReplaceIndex()') + pos += 1 + else: + raise XconfigParserError("code error") + elif first_token in [ 'end of string', '(', ')', ',', '@' ]: + raise XconfigParserError("Expected descriptor, got " + first_token) + elif is_valid_line_name(first_token) or first_token == '[': + # This section parses a raw input/layer/output name, e.g. "affine2" + # (which must start with an alphabetic character or underscore), + # optionally followed by an offset like '@-3'. + + d.operator = None + d.items = [first_token] + + # If the layer-name o is followed by '@', then + # we're parsing something like 'affine1@-3' which + # is syntactic sugar for 'Offset(affine1, 3)'. + if tokens[pos] == '@': + pos += 1 + try: + offset_t = int(tokens[pos]) + pos += 1 + except: + raise XconfigParserError("Parse error parsing {0}@{1}".format( + first_token, tokens[pos])) + if offset_t != 0: + inner_d = d + d = Descriptor() + # e.g. foo@3 is equivalent to 'Offset(foo, 3)'. + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + else: + # the last possible case is that 'first_token' is just an integer i, + # which can appear in things like Append(-3, 0, 3). + # See if the token is an integer. + # In this case, it's interpreted as the name of previous layer + # (with that time offset applied). + try: + offset_t = int(first_token) + except: + raise XconfigParserError("Parsing descriptor, expected descriptor but got " + + first_token) + assert isinstance(prev_names, list) + if len(prev_names) < 1: + raise XconfigParserError("Parsing descriptor, could not interpret '{0}' because " + "there is no previous layer".format(first_token)) + d.operator = None + # the layer name is the name of the most recent layer. + d.items = [prev_names[-1]] + if offset_t != 0: + inner_d = d + d = Descriptor() + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + return (d, pos) + + +# This function takes a string 'descriptor_string' which might +# look like 'Append([-1], [-2], input)', and a list of previous layer +# names like prev_names = ['foo', 'bar', 'baz'], and replaces +# the integers in brackets with the previous layers. -1 means +# the most recent previous layer ('baz' in this case), -2 +# means the last layer but one ('bar' in this case), and so on. +# It will throw an exception if the number is out of range. +# If there are no such expressions in the string, it's OK if +# prev_names == None (this is useful for testing). 
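As a usage illustration of the bracket shorthand handled by the function defined just below (this assumes the module is importable as libs.nnet3.xconfig.utils, the path parser.py uses; the expected values follow the checks in test_library() at the end of this file):

from __future__ import print_function
from libs.nnet3.xconfig.utils import (replace_bracket_expressions_in_descriptor,
                                      tokenize_descriptor)

prev_names = ['foo', 'bar', 'baz']

# [-1] is the most recent previous layer, [-2] the one before it, and so on.
print(replace_bracket_expressions_in_descriptor('Append([-1], [-2], input)', prev_names))
# prints: Append(baz, bar, input)

# tokenize_descriptor() applies the same substitution before tokenizing;
# the trailing 'end of string' marker is dropped here for readability.
print(tokenize_descriptor('[-1]@2', ['foo', 'bar'])[:-1])
# prints: ['bar', '@', '2']
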
+def replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names = None): + fields = re.split(r'(\[|\])\s*', descriptor_string) + out_fields = [] + i = 0 + while i < len(fields): + f = fields[i] + i += 1 + if f == ']': + raise XconfigParserError("Unmatched ']' in descriptor") + elif f == '[': + if i + 2 >= len(fields): + raise XconfigParserError("Error tokenizing string '{0}': '[' found too close " + "to the end of the descriptor.".format(descriptor_string)) + assert isinstance(prev_names, list) + try: + offset = int(fields[i]) + assert offset < 0 and -offset <= len(prev_names) + i += 2 # consume the int and the ']'. + except: + raise XconfigParserError("Error tokenizing string '{0}': expression [{1}] has an " + "invalid or out of range offset.".format(descriptor_string, fields[i])) + this_field = prev_names[offset] + out_fields.append(this_field) + else: + out_fields.append(f) + return ''.join(out_fields) + +# tokenizes 'descriptor_string' into the tokens that may be part of Descriptors. +# Note: for convenience in parsing, we add the token 'end-of-string' to this +# list. +# The argument 'prev_names' (for the names of previous layers and input and +# output nodes) is needed to process expressions like [-1] meaning the most +# recent layer, or [-2] meaning the last layer but one. +# The default None for prev_names is only supplied for testing purposes. +def tokenize_descriptor(descriptor_string, + prev_names = None): + # split on '(', ')', ',', '@', and space. Note: the parenthesis () in the + # regexp causes it to output the stuff inside the () as if it were a field, + # which is how the call to re.split() keeps characters like '(' and ')' as + # tokens. + fields = re.split(r'(\(|\)|@|,|\s)\s*', + replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names)) + ans = [] + for f in fields: + # don't include fields that are space, or are empty. + if re.match(r'^\s*$', f) is None: + ans.append(f) + + ans.append('end of string') + return ans + + +# This function parses a line in a config file, something like +# affine-layer name=affine1 input=Append(-3, 0, 3) +# and returns a pair, +# (first_token, fields), as (string, dict) e.g. in this case +# ('affine-layer', {'name':'affine1', 'input':'Append(-3, 0, 3)" +# Note: spaces are allowed in the field names but = signs are +# disallowed, which is why it's possible to parse them. +# This function also removes comments (anything after '#'). +# As a special case, this function will return None if the line +# is empty after removing spaces. +def parse_config_line(orig_config_line): + # Remove comments. + # note: splitting on '#' will always give at least one field... python + # treats splitting on space as a special case that may give zero fields. + config_line = orig_config_line.split('#')[0] + if re.match('[^a-zA-Z0-9\.\-\(\)_\s"]', config_line) is not None: + raise XconfigParserError("Xconfig line has unknown characters.", config_line) + + # Now split on space; later we may splice things back together. + fields=config_line.split() + if len(fields) == 0: + return None # Line was only whitespace after removing comments. + first_token = fields[0] + # if first_token does not look like 'foo-bar' or 'foo-bar2', then die. + if re.match('^[a-z][-a-z0-9]+$', first_token) is None: + raise XconfigParserError("Error parsing config line (first field doesn't look right): {0}".format( + orig_config_line)) + # get rid of the first field which we put in 'first_token'. 
+ fields = fields[1:] + + rest_of_line = ' '.join(fields) + # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' + positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + if not len(positions) % 2 == 0: + raise XconfigParserError('"s should occur in pairs', config_line) + + # add the " enclosed strings and corresponding keys to the dict + # and remove them from the rest_of_line + num_strings = len(positions) / 2 + fields = [] + for i in range(num_strings): + start = positions[i * 2] + end = positions[i * 2 + 1] + rest_of_line_after = rest_of_line[end + 1:] + parts = rest_of_line[:start].split() + rest_of_line_before = ' '.join(parts[:-1]) + assert(parts[-1][-1] == '=') + fields.append(parts[-1][:-1]) + fields.append(rest_of_line[start + 1 : end]) + rest_of_line = rest_of_line_before + ' ' + rest_of_line_after + + # suppose rest_of_line is: 'input=Append(foo, bar) foo=bar' + # then after the below we'll get + # fields = ['', 'input', 'Append(foo, bar)', 'foo', 'bar'] + ans_dict = dict() + other_fields = re.split(r'\s*([-a-zA-Z0-9_]*)=', rest_of_line) + if not (other_fields[0] == '' and len(other_fields) % 2 == 1): + raise XconfigParserError("Could not parse config line: " + orig_config_line) + fields += other_fields[1:] + num_variables = len(fields) / 2 + for i in range(num_variables): + var_name = fields[i * 2] + var_value = fields[i * 2 + 1] + if re.match(r'[a-zA-Z_]', var_name) is None: + raise XconfigParserError("Expected variable name '{0}' to start with alphabetic character or _, " + "in config line {1}".format(var_name, orig_config_line)) + if var_name in ans_dict: + raise XconfigParserError("Config line has multiply defined variable {0}: {1}".format( + var_name, orig_config_line)) + ans_dict[var_name] = var_value + return (first_token, ans_dict) + +# Reads a config file and returns a list of objects, where each object +# represents one line of the file. 
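Before the file-level reader below, here is what parse_config_line() (defined just above) returns for a typical xconfig line; this mirrors the calls in test_library() at the end of the file and assumes the libs.nnet3.xconfig.utils import path used by parser.py.

from libs.nnet3.xconfig.utils import parse_config_line

(first_token, key_to_value) = parse_config_line('affine-layer input=Append(foo, bar) foo=bar')
assert first_token == 'affine-layer'
assert key_to_value == {'input': 'Append(foo, bar)', 'foo': 'bar'}

# Lines that are blank or contain only comments come back as None.
assert parse_config_line('# just a comment') is None

# Quoted values such as opt2="a=1 b=2" are kept as a single value
# (see the example lines in test_library() below).
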
+def read_config_file(filename): + try: + f = open(filename, "r") + except XconfigParserError as e: + raise XconfigParserError("Error reading config file {0}: {1}".format( + filename, repr(e))) + ans = [] + prev_names = [] + while True: + line = f.readline() + if line == '': + break + x = parse_config_line(line) + if x is None: + continue # blank line + (first_token, key_to_value) = x + layer_object = config_line_to_object(first_token, key_to_value, prev_names) + ans.append(layer_object) + prev_names.append(layer_object.get_name()) + +def test_library(): + tokenize_test = lambda x: tokenize_descriptor(x)[:-1] # remove 'end of string' + assert tokenize_test("hi") == ['hi'] + assert tokenize_test("hi there") == ['hi', 'there'] + assert tokenize_test("hi,there") == ['hi', ',', 'there'] + assert tokenize_test("hi@-1,there") == ['hi', '@', '-1', ',', 'there'] + assert tokenize_test("hi(there)") == ['hi', '(', 'there', ')'] + assert tokenize_descriptor("[-1]@2", ['foo', 'bar'])[:-1] == ['bar', '@', '2' ] + assert tokenize_descriptor("[-2].special@2", ['foo', 'bar'])[:-1] == ['foo.special', '@', '2' ] + + assert Descriptor('foo').str() == 'foo' + assert Descriptor('Sum(foo,bar)').str() == 'Sum(foo, bar)' + assert Descriptor('Sum(Offset(foo,1),Offset(foo,0))').str() == 'Sum(Offset(foo, 1), Offset(foo, 0))' + for x in [ 'Append(foo, Sum(bar, Offset(baz, 1)))', 'Failover(foo, Offset(bar, -1))', + 'IfDefined(Round(baz, 3))', 'Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))', + 'IfDefined(ReplaceIndex(ivector, t, 0))', 'ReplaceIndex(foo, x, 0)' ]: + if not Descriptor(x).str() == x: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), x)) + + prev_names = ['last_but_one_layer', 'prev_layer'] + for x, y in [ ('Sum(foo,bar)', 'Sum(foo, bar)'), + ('Sum(foo1,bar-3_4)', 'Sum(foo1, bar-3_4)'), + ('Append(input@-3, input@0, input@3)', + 'Append(Offset(input, -3), input, Offset(input, 3))'), + ('Append(-3,0,3)', + 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), + ('[-1]', 'prev_layer'), + ('[-2]', 'last_but_one_layer'), + ('[-2]@3', + 'Offset(last_but_one_layer, 3)') ]: + if not Descriptor(x, prev_names).str() == y: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), y)) + + + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar opt2="a=1 b=2"')) + print(parse_config_line('affine-layer1 input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer')) + +if __name__ == "__main__": + test_library() diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py index f012d06cca9..d58db33bf98 100644 --- a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -169,7 +169,8 @@ def PrepareInitialAcousticModel(dir, run_opts): command = run_opts.command, dir = dir)) def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, left_context, right_context, + leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts): # Now do combination. 
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where @@ -188,10 +189,13 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ + --verbose=3 {dir}/den.fst {raw_models} \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/combine.cegs ark:- | \ + nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, dir = dir, raw_models = " ".join(raw_model_strings), num_chunk_per_minibatch = num_chunk_per_minibatch, @@ -201,9 +205,20 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'final', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts, + wait = False) -def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, +def ComputeTrainCvProbabilities(dir, iter, + egs_dir, left_context, right_context, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) @@ -213,9 +228,10 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/valid_diagnostic.cegs ark:-| nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) @@ -225,11 +241,12 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/train_diagnostic.cegs ark:- | nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) diff 
--git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cd9ebf4c7a3..53bd9f8924b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -118,11 +118,7 @@ def GetArgs(): " chain model's output") parser.add_argument("--chain.left-deriv-truncate", type=int, dest='left_deriv_truncate', - default = None, help="") - parser.add_argument("--chain.right-deriv-truncate", type=int, - dest='right_deriv_truncate', - default = None, help="") - + default = None, help="Deprecated. Kept for back compatibility") # trainer options parser.add_argument("--trainer.srand", type=int, dest='srand', @@ -224,6 +220,12 @@ def GetArgs(): parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=512, help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', + default = None, + help="If specified, it is the number of frames that the derivative will be backpropagated through the chunk boundaries, " + "e.g., During BLSTM model training if the chunk-width=150 and deriv-truncate-margin=5, then the derivative will be " + "backpropagated up to t=-5 and t=154 in the forward and backward LSTM sequence respectively; " + "otherwise, the derivative will be backpropagated to the end of the sequence.") # General options parser.add_argument("--stage", type=int, default=-4, @@ -284,6 +286,12 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if not args.left_deriv_truncate is None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning("--chain.left-deriv-truncate (deprecated) is set by user, " + "and --trainer.deriv-truncate-margin is set to negative of that value={0}. 
" + "We recommend using the option --trainer.deriv-truncate-margin.".format(args.deriv_truncate_margin)) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") @@ -325,9 +333,9 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, + raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, + min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, @@ -340,10 +348,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi # but we use the same script for consistency with FF-DNN code deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + if not min_deriv_time is None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time) + if not max_deriv_time is None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time) processes = [] for job in range(1,num_jobs+1): @@ -366,7 +374,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, train_queue_opt = run_opts.train_queue_opt, @@ -379,11 +387,12 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, momentum = momentum, max_param_change = max_param_change, raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, + egs_dir = egs_dir, lc=left_context, rc=right_context, + archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, cache_io_opts = cur_cache_io_opts, num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) + wait = False) processes.append(process_handle) @@ -404,7 +413,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + left_context, right_context, + apply_deriv_weights, min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, 
truncate_deriv_weights, @@ -427,8 +437,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + chain_lib.ComputeTrainCvProbabilities(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts) if iter > 0: chain_lib.ComputeProgress(dir, iter, run_opts) @@ -460,15 +477,30 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 cur_max_param_change = float(max_param_change) / math.sqrt(2) - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) + TrainNewModels(dir = dir, + iter = iter, + srand = srand, + num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + momentum = momentum, + max_param_change = cur_max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + frame_subsampling_factor = frame_subsampling_factor, + truncate_deriv_weights = truncate_deriv_weights, + cache_io_opts = cache_io_opts, + run_opts = run_opts) [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] @@ -567,14 +599,15 @@ def Train(args, run_opts): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + egs_left_context = left_context + args.frame_subsampling_factor/2 + egs_right_context = right_context + args.frame_subsampling_factor/2 default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") # this is where get_egs.sh is called. 
chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, - left_context + args.frame_subsampling_factor/2, - right_context + args.frame_subsampling_factor/2, + egs_left_context, egs_right_context, run_opts, left_tolerance = args.left_tolerance, right_tolerance = args.right_tolerance, @@ -594,7 +627,7 @@ def Train(args, run_opts): else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, egs_left_context, egs_right_context) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -638,6 +671,12 @@ def Train(args, run_opts): args.initial_effective_lrate, args.final_effective_lrate) + min_deriv_time = None + max_deriv_time = None + if not args.deriv_truncate_margin is None: + min_deriv_time = -args.deriv_truncate_margin + max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): @@ -653,18 +692,32 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value, - args.num_chunk_per_minibatch, - num_hidden_layers, args.add_layers_period, - args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, - args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, - args.frame_subsampling_factor, - args.truncate_deriv_weights, run_opts) + TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = args.apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + frame_subsampling_factor = args.frame_subsampling_factor, + truncate_deriv_weights = args.truncate_deriv_weights, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -683,10 +736,17 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - chain_lib.CombineModels(args.dir, num_iters, 
num_iters_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts) + chain_lib.CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + run_opts = run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 95f6c784851..4bfcb219fc3 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -96,7 +96,7 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) @@ -111,7 +111,7 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) @@ -484,4 +484,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 2290c4d2e7f..2a6499090e2 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -90,11 +90,12 @@ def GetDotNodeName(name_string, is_component = False): # this function is required as dot does not allow all the component names # allowed by nnet3. # Identified incompatibilities : - # 1. dot does not allow hyphen(-) in names + # 1. dot does not allow hyphen(-) and dot(.) in names # 2. 
Nnet3 names can be shared among components and component nodes # dot does not allow common names # node_name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("\.", "_dot_", node_name_string) if is_component: node_name_string += node_name_string.strip() + "_component" return {"label":name_string, "node":node_name_string} diff --git a/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py new file mode 100644 index 00000000000..e6dc907fe0a --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py @@ -0,0 +1 @@ +# This module will house the latest training libraries being written by Vimal diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index c36de8c16bf..06ccf9657be 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -17,6 +17,7 @@ if [ $# != 3 ]; then echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" echo "" echo "Main options (for others, see top of script file)" + echo " --info-bin # Name of the binary to generate the nnet3 file" echo " --component-attributes # attributes to be printed in nnet3 components" echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" @@ -34,6 +35,7 @@ $info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ $attr $dot_file +echo "Generated the dot file $dot_file" command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..e92ab05a847 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -252,7 +252,10 @@ def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') if (egs_left_context < left_context) or (egs_right_context < right_context): - raise Exception('The egs have insufficient context') + raise Exception('The egs have insufficient context.' + ' Required left context is {rlc} and available left context is {alc}.' 
+ ' Required right context is {rrc} and available right context is {arc}.'.format(rlc = left_context, alc = egs_left_context, + rrc = right_context, arc = egs_right_context)) frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) @@ -506,52 +509,65 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False): +def ComputeTrainCvProbabilities(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/valid_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False): +def ComputeProgress(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait=False): prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) + + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ -"ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" +"ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:-|" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, mb_size = mb_size, prev_model = prev_model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, left_context, right_context, chunk_width = None): # Now do combination. 
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -570,26 +586,39 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + --verbose=3 {raw_models} "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/combine.egs ark:- | \ + nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, num_iters = num_iters, + context_opts = context_opts, egs_dir = egs_dir)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'combined', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + wait = False) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, left_context, right_context, + run_opts): # Note: this just uses CPUs, using a smallish subset of data. """ Computes the average posterior of the network""" import glob @@ -601,19 +630,24 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:- ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ -matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, dir = dir, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, egs_dir = egs_dir, egs_part = egs_part, + context_opts = context_opts, prior_gpu_opt = run_opts.prior_gpu_opt)) # make sure there is time for $dir/post.{iter}.*.vec to appear. 
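The same context-trimming pattern recurs in each of the pipelines above; the following is a small illustrative sketch (a hypothetical helper, not part of this patch) of how the context_opts string is built and spliced into an egs-reading pipeline before nnet3-merge-egs.

from __future__ import print_function

def diagnostic_egs_rspecifier(egs_dir, left_context, right_context, mb_size=256):
    # Mirrors the pattern used above: trim the egs to the context the current
    # model needs with nnet3-copy-egs before merging them into minibatches.
    context_opts = "--left-context={0} --right-context={1}".format(
        left_context, right_context)
    return ("ark,bg:nnet3-copy-egs {context_opts} "
            "ark:{egs_dir}/valid_diagnostic.egs ark:- | "
            "nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |".format(
                context_opts=context_opts, egs_dir=egs_dir, mb_size=mb_size))

# Illustrative values; the real scripts compute these from the model's context.
print(diagnostic_egs_rspecifier('exp/nnet3/tdnn/egs', left_context=16, right_context=12))
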
diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..26ca16c364b 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -102,13 +102,23 @@ def Compile(self): lat_file.close() logger.info("Compiling the latex report.") try: - proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(['pdflatex', '-interaction=batchmode', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() except Exception as e: logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) return False return True +def LatexCompliantName(name_string): + # this function is required as latex does not allow all the component names + # allowed by nnet3. + # Identified incompatibilities : + # 1. latex does not allow dot(.) in file names + # + node_name_string = re.sub("\.", "_dot_", name_string) + + return node_name_string + def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): assert(start_iter >= 1) @@ -240,7 +250,8 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) @@ -317,7 +328,8 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Clipped-proportion value at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Clipped proportion at {0}".format(component_name)) @@ -417,7 +429,8 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) 
diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py
index e4a9e617e48..4139d446872 100755
--- a/egs/wsj/s5/steps/nnet3/train_dnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_dnn.py
@@ -359,10 +359,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
         f.write(str(srand))
         f.close()
 
-    ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts)
+    ComputeTrainCvProbabilities(dir=dir, iter=iter, egs_dir=egs_dir,
+                                left_context=left_context, right_context=right_context,
+                                run_opts=run_opts)
 
     if iter > 0:
-        ComputeProgress(dir, iter, egs_dir, run_opts)
+        ComputeProgress(dir=dir, iter=iter, egs_dir=egs_dir,
+                        left_context=left_context, right_context=right_context,
+                        run_opts=run_opts)
 
     if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0):
 
@@ -578,14 +582,24 @@ def Train(args, run_opts):
             logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed)))
-            TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs,
-                              num_archives_processed, num_archives,
-                              learning_rate(iter, current_num_jobs, num_archives_processed),
-                              args.minibatch_size, args.frames_per_eg,
-                              num_hidden_layers, args.add_layers_period,
-                              left_context, right_context,
-                              args.momentum, args.max_param_change,
-                              args.shuffle_buffer_size, run_opts)
+            TrainOneIteration(dir = args.dir,
+                              iter = iter,
+                              srand = args.srand,
+                              egs_dir = egs_dir,
+                              num_jobs = current_num_jobs,
+                              num_archives_processed = num_archives_processed,
+                              num_archives = num_archives,
+                              learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed),
+                              minibatch_size = args.minibatch_size,
+                              frames_per_eg = args.frames_per_eg,
+                              num_hidden_layers = num_hidden_layers,
+                              add_layers_period = args.add_layers_period,
+                              left_context = left_context,
+                              right_context = right_context,
+                              momentum = args.momentum,
+                              max_param_change = args.max_param_change,
+                              shuffle_buffer_size = args.shuffle_buffer_size,
+                              run_opts = run_opts)
 
             if args.cleanup:
                 # do a clean up everythin but the last 2 models, under certain conditions
                 RemoveModel(args.dir, iter-2, num_iters, num_iters_combine,
@@ -604,12 +618,24 @@ def Train(args, run_opts):
     if args.stage <= num_iters:
         logger.info("Doing final combination to produce final.mdl")
-        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts)
+        CombineModels(dir = args.dir,
+                      num_iters = num_iters,
+                      num_iters_combine = num_iters_combine,
+                      egs_dir = egs_dir,
+                      left_context = left_context,
+                      right_context = right_context,
+                      run_opts = run_opts)
 
     if args.stage <= num_iters + 1:
         logger.info("Getting average posterior for purposes of adjusting the priors.")
-        avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir,
-                                                    num_archives, args.prior_subset_size, run_opts)
+        avg_post_vec_file = ComputeAveragePosterior(dir = args.dir,
+                                                    iter = 'combined',
+                                                    egs_dir = egs_dir,
+                                                    num_archives = num_archives,
+                                                    prior_subset_size = args.prior_subset_size,
+                                                    left_context = left_context,
+                                                    right_context = right_context,
+                                                    run_opts = run_opts)
 
         logger.info("Re-adjusting priors based on computed posteriors")
         combined_model = "{dir}/combined.mdl".format(dir = args.dir)
diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py
index 7ac7a58a3d5..89db4276cfc 100755
--- a/egs/wsj/s5/steps/nnet3/train_rnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_rnn.py
@@ -194,7 +194,7 @@ def GetArgs():
                         help="Number of sequences to be processed in parallel every minibatch" )
     parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps',
                         default=None,
-                        help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." )
+                        help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to (chunk-width + 10)." )
 
     # General options
     parser.add_argument("--stage", type=int, default=-4,
@@ -346,7 +346,7 @@ def __init__(self):
 
 def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
                    raw_model_string, egs_dir,
-                   left_context, right_context, min_deriv_time,
+                   left_context, right_context, min_deriv_time, max_deriv_time,
                    momentum, max_param_change,
                    shuffle_buffer_size, num_chunk_per_minibatch,
                    cache_read_opt, run_opts):
@@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
 nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \
  --print-interval=10 --momentum={momentum} \
  --max-param-change={max_param_change} \
- --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \
+ --optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \
  "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \
  {dir}/{next_iter}.{job}.raw
     """.format(command = run_opts.command,
@@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
                parallel_train_opts = run_opts.parallel_train_opts,
                cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt,
                momentum = momentum, max_param_change = max_param_change,
-               min_deriv_time = min_deriv_time,
+               min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time,
               raw_model = raw_model_string, context_opts = context_opts,
               egs_dir = egs_dir, archive_index = archive_index,
               shuffle_buffer_size = shuffle_buffer_size,
@@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
                       num_jobs, num_archives_processed, num_archives,
                       learning_rate, shrinkage_value, num_chunk_per_minibatch,
                       num_hidden_layers, add_layers_period,
-                      left_context, right_context, min_deriv_time,
+                      left_context, right_context, min_deriv_time, max_deriv_time,
                       momentum, max_param_change, shuffle_buffer_size,
                       cv_minibatch_size, run_opts):
     # Set off jobs doing some diagnostics, in the background.
@@ -430,10 +430,22 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
         f.close()
 
-    ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size)
+    ComputeTrainCvProbabilities(dir = dir,
+                                iter = iter,
+                                egs_dir = egs_dir,
+                                left_context = left_context,
+                                right_context = right_context,
+                                run_opts = run_opts,
+                                mb_size=cv_minibatch_size)
 
     if iter > 0:
-        ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size)
+        ComputeProgress(dir = dir,
+                        iter = iter,
+                        egs_dir = egs_dir,
+                        left_context = left_context,
+                        right_context = right_context,
+                        run_opts = run_opts,
+                        mb_size=cv_minibatch_size)
 
     # an option for writing cache (storing pairs of nnet-computations
     # and computation-requests) during training.
@@ -467,12 +479,24 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
         except OSError:
             pass
 
-    TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
-                   raw_model_string, egs_dir,
-                   left_context, right_context, min_deriv_time,
-                   momentum, max_param_change,
-                   shuffle_buffer_size, cur_num_chunk_per_minibatch,
-                   cache_read_opt, run_opts)
+    TrainNewModels(dir = dir,
+                   iter = iter,
+                   srand = srand,
+                   num_jobs = num_jobs,
+                   num_archives_processed = num_archives_processed,
+                   num_archives = num_archives,
+                   raw_model_string = raw_model_string,
+                   egs_dir = egs_dir,
+                   left_context = left_context,
+                   right_context = right_context,
+                   min_deriv_time = min_deriv_time,
+                   max_deriv_time = max_deriv_time,
+                   momentum = momentum,
+                   max_param_change = max_param_change,
+                   shuffle_buffer_size = shuffle_buffer_size,
+                   num_chunk_per_minibatch = cur_num_chunk_per_minibatch,
+                   cache_read_opt = cache_read_opt,
+                   run_opts = run_opts)
     [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter))
     nnets_list = []
     for n in models_to_average:
@@ -627,11 +651,13 @@ def Train(args, run_opts):
             cur_egs_dir=egs_dir
 
     if args.num_bptt_steps is None:
-        num_bptt_steps = args.chunk_width
+        # num_bptt_steps is set to (chunk_width + 10) by default
+        num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context)
     else:
         num_bptt_steps = args.num_bptt_steps
 
     min_deriv_time = args.chunk_width - num_bptt_steps
+    max_deriv_time = num_bptt_steps - 1
 
     logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
 
@@ -672,6 +698,7 @@ def Train(args, run_opts):
                                        left_context = left_context,
                                        right_context = right_context,
                                        min_deriv_time = min_deriv_time,
+                                       max_deriv_time = max_deriv_time,
                                        momentum = args.momentum,
                                        max_param_change= args.max_param_change,
                                        shuffle_buffer_size = args.shuffle_buffer_size,
@@ -696,13 +723,25 @@ def Train(args, run_opts):
     if args.stage <= num_iters:
         logger.info("Doing final combination to produce final.mdl")
-        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts,
-                      chunk_width = args.chunk_width)
+        CombineModels(dir = args.dir,
+                      num_iters = num_iters,
+                      num_iters_combine = num_iters_combine,
+                      egs_dir = egs_dir,
+                      left_context = left_context,
+                      right_context = right_context,
+                      run_opts = run_opts,
+                      chunk_width = args.chunk_width)
 
     if args.stage <= num_iters + 1:
         logger.info("Getting average posterior for purposes of adjusting the priors.")
-        avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir,
-                                                    num_archives, args.prior_subset_size, run_opts)
+        avg_post_vec_file = ComputeAveragePosterior(dir = args.dir,
+                                                    iter = 'combined',
+                                                    egs_dir = egs_dir,
+                                                    num_archives = num_archives,
+                                                    prior_subset_size = args.prior_subset_size,
+                                                    left_context = left_context,
+                                                    right_context = right_context,
+                                                    run_opts = run_opts)
 
         logger.info("Re-adjusting priors based on computed posteriors")
         combined_model = "{dir}/combined.mdl".format(dir = args.dir)
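To make the new deriv-time defaults concrete: num_bptt_steps now reaches up to 10 frames beyond the chunk (bounded by the available chunk context), which lets min-deriv-time go negative, while max-deriv-time caps the other end. The numbers below are only an example that mirrors the expressions in train_rnn.py above:

# Example values only; the real script takes these from its options.
chunk_width = 20
chunk_left_context = 40
chunk_right_context = 40

num_bptt_steps = chunk_width + min(10, chunk_left_context, chunk_right_context)
min_deriv_time = chunk_width - num_bptt_steps   # 20 - 30 = -10
max_deriv_time = num_bptt_steps - 1             # 29

print(num_bptt_steps, min_deriv_time, max_deriv_time)  # 30 -10 29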
diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
new file mode 100755
index 00000000000..e29a9404403
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+
+# we're using python 3.x style print but want it to work in python 2.x,
+from __future__ import print_function
+import os
+import argparse
+import shlex
+import sys
+import warnings
+import copy
+import imp
+import ast
+from collections import defaultdict
+
+sys.path.insert(0, 'steps/')
+# the following is in case we weren't running this from the normal directory.
+sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/')
+
+import libs.nnet3.xconfig.parser as xparser
+# do the proper import when python scripts have been refactored
+nnet3_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py')
+
+def get_args():
+    # we add compulsory arguments as named arguments for readability
+    parser = argparse.ArgumentParser(description='Reads an xconfig file and creates config files '
+                                     'for neural net creation and training',
+                                     epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples')
+    parser.add_argument('--xconfig-file', required=True,
+                        help='Filename of input xconfig file')
+    parser.add_argument('--config-dir', required=True,
+                        help='Directory to write config files and variables')
+
+    print(' '.join(sys.argv))
+
+    args = parser.parse_args()
+    args = check_args(args)
+
+    return args
+
+def check_args(args):
+    if not os.path.exists(args.config_dir):
+        os.makedirs(args.config_dir)
+    return args
+
+
+
+
+def backup_xconfig_file(xconfig_file, config_dir):
+    # we write a copy of the xconfig file just to have a record of the original
+    # input.
+    try:
+        xconfig_file_out = open(config_dir + '/xconfig', 'w')
+    except:
+        sys.exit('{0}: error opening file {1}/xconfig for output'.format(
+            sys.argv[0], config_dir))
+    try:
+        xconfig_file_in = open(xconfig_file)
+    except:
+        sys.exit('{0}: error opening file {1} for input'.format(sys.argv[0], config_dir))
+
+    print("# This file was created by the command:\n"
+          "# {0}\n"
+          "# It is a copy of the source from which the config files in "
+          "# this directory were generated.\n".format(' '.join(sys.argv)),
+          file=xconfig_file_out)
+
+    while True:
+        line = xconfig_file_in.readline()
+        if line == '':
+            break
+        print(line.strip(), file=xconfig_file_out)
+    xconfig_file_out.close()
+    xconfig_file_in.close()
+
+
+# This function writes config_dir/xconfig.expanded.1 and
+# config_dir/xconfig.expanded.2, showing some of the internal stages of
+# processing the xconfig file before turning it into config files.
+def write_expanded_xconfig_files(config_dir, all_layers):
+    try:
+        xconfig_file_out = open(config_dir + '/xconfig.expanded.1', 'w')
+    except:
+        sys.exit('{0}: error opening file {1}/xconfig.expanded.1 for output'.format(
+            sys.argv[0], config_dir))
+
+
+    print('# This file was created by the command:\n'
+          '# ' + ' '.join(sys.argv) + '\n'
+          '# It contains the same content as ./xconfig but it was parsed and\n'
+          '# default config values were set.\n'
+          '# See also ./xconfig.expanded.2\n', file=xconfig_file_out)
+
+    for layer in all_layers:
+        print(str(layer), file=xconfig_file_out)
+    xconfig_file_out.close()
+
+    try:
+        xconfig_file_out = open(config_dir + '/xconfig.expanded.2', 'w')
+    except:
+        sys.exit('{0}: error opening file {1}/xconfig.expanded.2 for output'.format(
+            sys.argv[0], config_dir))
+
+    print('# This file was created by the command:\n'
+          '# ' + ' '.join(sys.argv) + '\n'
+          '# It contains the same content as ./xconfig but it was parsed,\n'
+          '# default config values were set, and Descriptors (input=xxx) were normalized.\n'
+          '# See also ./xconfig.expanded.1\n\n',
+          file=xconfig_file_out)
+
+    for layer in all_layers:
+        layer.normalize_descriptors()
+        print(str(layer), file=xconfig_file_out)
+    xconfig_file_out.close()
+
+# This function returns a map from config-file basename
+# e.g. 'init', 'ref', 'layer1' to a documentation string that goes
+# at the top of the file.
+def get_config_headers():
+    ans = defaultdict(str)  # resulting dict will default to the empty string
+    # for any config files not explicitly listed here.
+    ans['init'] = ('# This file was created by the command:\n'
+                   '# ' + ' '.join(sys.argv) + '\n'
+                   '# It contains the input of the network and is used in\n'
+                   '# accumulating stats for an LDA-like transform of the\n'
+                   '# input features.\n');
+    ans['ref'] = ('# This file was created by the command:\n'
+                  '# ' + ' '.join(sys.argv) + '\n'
+                  '# It contains the entire neural network, but with those\n'
+                  '# components that would normally require fixed vectors/matrices\n'
+                  '# read from disk, replaced with random initialization\n'
+                  '# (this applies to the LDA-like transform and the\n'
+                  '# presoftmax-prior-scale, if applicable). This file\n'
+                  '# is used only to work out the left-context and right-context\n'
+                  '# of the network.\n');
+    ans['final'] = ('# This file was created by the command:\n'
+                    '# ' + ' '.join(sys.argv) + '\n'
+                    '# It contains the entire neural network.\n')
+
+    return ans;
+
+
+
+
+# This is where most of the work of this program happens.
+def write_config_files(config_dir, all_layers):
+    # config_basename_to_lines is a map from the basename of the
+    # config, as a string (i.e. 'ref', 'all', 'init') to a list of
+    # strings representing lines to put in the config file.
+    config_basename_to_lines = defaultdict(list)
+
+    config_basename_to_header = get_config_headers()
+
+    for layer in all_layers:
+        try:
+            pairs = layer.get_full_config()
+            for config_basename, line in pairs:
+                config_basename_to_lines[config_basename].append(line)
+        except Exception as e:
+            print("{0}: error producing config lines from xconfig "
+                  "line '{1}': error was: {2}".format(sys.argv[0], str(layer),
+                  repr(e)), file=sys.stderr)
+            # we use raise rather than raise(e) as using a blank raise
+            # preserves the backtrace
+            raise
+
+    for basename,lines in config_basename_to_lines.items():
+        header = config_basename_to_header[basename]
+        filename = '{0}/{1}.config'.format(config_dir, basename)
+        try:
+            f = open(filename, 'w')
+            print(header, file=f)
+            for line in lines:
+                print(line, file=f)
+            f.close()
+        except Exception as e:
+            print('{0}: error writing to config file {1}: error is {2}'.format(
+                sys.argv[0], filename, repr(e)), file=sys.stderr)
+            # we use raise rather than raise(e) as using a blank raise
+            # preserves the backtrace
+            raise
+
+def add_back_compatibility_info(config_dir):
+    """This will be removed when python script refactoring is done."""
+
+    nnet3_lib.RunKaldiCommand("nnet3-init {0}/ref.config {0}/ref.raw".format(config_dir))
+    out, err = nnet3_lib.RunKaldiCommand("nnet3-info {0}/ref.raw | head -4".format(config_dir))
+    #out looks like this
+    # left-context: 7
+    # right-context: 0
+    # num-parameters: 90543902
+    # modulus: 1
+    info = {}
+    for line in out.split("\n"):
+        parts = line.split(":")
+        if len(parts) != 2:
+            continue
+        info[parts[0].strip()] = int(parts[1].strip())
+
+    # Writing the back-compatible vars file
+    # model_left_context=0
+    # model_right_context=7
+    # num_hidden_layers=3
+    vf = open('{0}/vars'.format(config_dir), 'w')
+    vf.write('model_left_context={0}\n'.format(info['left-context']))
+    vf.write('model_right_context={0}\n'.format(info['right-context']))
+    vf.write('num_hidden_layers=1\n')
+    vf.close()
+
+    nnet3_lib.ForceSymlink("final.config".format(config_dir),
+                           "{0}/layer1.config".format(config_dir))
+
+def main():
+    args = get_args()
+    backup_xconfig_file(args.xconfig_file, args.config_dir)
+    all_layers = xparser.read_xconfig_file(args.xconfig_file)
+    write_expanded_xconfig_files(args.config_dir, all_layers)
+    write_config_files(args.config_dir, all_layers)
+    add_back_compatibility_info(args.config_dir)
+
+
+if __name__ == '__main__':
+    main()
+
+
+# test:
+# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
+# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
+
+# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'relu-renorm-layer name=affine1 dim=1024'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
+
+# mkdir -p foo; (echo 'input dim=100 name=ivector'; echo 'input dim=40 name=input'; echo 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh
index ea5264a0f07..054210cdd23 100755
--- a/egs/wsj/s5/utils/prepare_lang.sh
+++ b/egs/wsj/s5/utils/prepare_lang.sh
@@ -51,7 +51,6 @@
 # Begin configuration section.
 num_sil_states=5
 num_nonsil_states=3
-num_word_disambig_syms=1
 position_dependent_phones=true
 # position_dependent_phones is false also when position dependent phones and word_boundary.txt
 # have been generated by another source
diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc
index 00ed56308b3..b0c963595a1 100644
--- a/src/chainbin/nnet3-chain-copy-egs.cc
+++ b/src/chainbin/nnet3-chain-copy-egs.cc
@@ -316,15 +316,15 @@ int main(int argc, char *argv[]) {
           num_written++;
         }
       } else if (count > 0) {
-        const NnetChainExample &eg = example_reader.Value();
+        NnetChainExample eg = example_reader.Value();
+        if (frame_shift != 0)
+          ShiftChainExampleTimes(frame_shift, exclude_names, &eg);
         NnetChainExample eg_out;
         if (left_context != -1 || right_context != -1)
          ModifyChainExampleContext(eg, left_context, right_context,
                                    frame_subsampling_factor, &eg_out);
         else
-          eg_out = eg;
-        if (frame_shift != 0)
-          ShiftChainExampleTimes(frame_shift, exclude_names, &eg_out);
+          eg_out.Swap(&eg);
         if (truncate_deriv_weights != 0)
           TruncateDerivWeights(truncate_deriv_weights, &eg_out);
         for (int32 c = 0; c < count; c++) {
@@ -344,5 +344,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-
-
diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc
index af2147147d7..ad5f715a294 100644
--- a/src/nnet3/nnet-nnet.cc
+++ b/src/nnet3/nnet-nnet.cc
@@ -783,6 +783,13 @@ Nnet& Nnet::operator =(const Nnet &nnet) {
 
 std::string Nnet::Info() const {
   std::ostringstream os;
+
+  if(IsSimpleNnet(*this)) {
+    int32 left_context, right_context;
+    ComputeSimpleNnetContext(*this, &left_context, &right_context);
+    os << "left-context: " << left_context << "\n";
+    os << "right-context: " << right_context << "\n";
+  }
   os << "num-parameters: " << NumParameters(*this) << "\n";
   os << "modulus: " << this->Modulus() << "\n";
   std::vector<std::string> config_lines;
diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc
index 733d162748e..3bacf455f3b 100644
--- a/src/nnet3/nnet-parse.cc
+++ b/src/nnet3/nnet-parse.cc
@@ -427,7 +427,7 @@ bool IsValidName(const std::string &name) {
   for (size_t i = 0; i < name.size(); i++) {
     if (i == 0 && !isalpha(name[i]) && name[i] != '_')
       return false;
-    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-')
+    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.')
      return false;
   }
   return true;
 }
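The nnet-parse.cc change is what makes the dotted node names handled above legal in the first place. Restated in Python purely for illustration (the C++ above is authoritative, and this sketch ignores locale differences in isalnum):

def is_valid_name(name):
    # mirrors IsValidName() after this patch: first char must be a letter or
    # underscore; every char may be alphanumeric, '_', '-', or (newly) '.'
    for i, ch in enumerate(name):
        if i == 0 and not (ch.isalpha() or ch == '_'):
            return False
        if not (ch.isalnum() or ch in '_-.'):
            return False
    return True

assert is_valid_name("lstm1.c_t")      # dotted component/node names now pass
assert not is_valid_name("1bad.name")  # still must start with a letter or '_'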
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index b84ac90c76e..f48885175b4 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -96,16 +96,16 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
   BaseFloat dropout_proportion = 0.0;
   bool ok = cfl->GetValue("dim", &dim) &&
     cfl->GetValue("dropout-proportion", &dropout_proportion);
-  if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
+  if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
       dropout_proportion < 0.0 || dropout_proportion > 1.0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << cfl->WholeLine() << "\"";
+    KALDI_ERR << "Invalid initializer for layer of type "
+              << Type() << ": \"" << cfl->WholeLine() << "\"";
   Init(dim, dropout_proportion);
 }
 
 std::string DropoutComponent::Info() const {
   std::ostringstream stream;
-  stream << Type() << ", dim = " << dim_
+  stream << Type() << ", dim = " << dim_
          << ", dropout-proportion = " << dropout_proportion_;
   return stream.str();
 }
@@ -119,12 +119,12 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
   BaseFloat dropout = dropout_proportion_;
   KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
 
-  // This const_cast is only safe assuming you don't attempt
+  // This const_cast is only safe assuming you don't attempt
   // to use multi-threaded code with the GPU.
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
 
-  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-  out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will
+  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
+  out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will
   // be zero and (1 - dropout) will be 1.0.
 
   out->MulElements(in);
@@ -147,7 +147,7 @@ void DropoutComponent::Backprop(const std::string &debug_info,
 }
 
-
+
 void DropoutComponent::Read(std::istream &is, bool binary) {
   ExpectOneOrTwoTokens(is, binary, "<DropoutComponent>", "<Dim>");
   ReadBasicType(is, binary, &dim_);
diff --git a/src/nnet3/online-nnet3-decodable-simple.h b/src/nnet3/online-nnet3-decodable-simple.h
index cfd70ccea38..af7c18da64b 100644
--- a/src/nnet3/online-nnet3-decodable-simple.h
+++ b/src/nnet3/online-nnet3-decodable-simple.h
@@ -102,6 +102,7 @@ class DecodableNnet3SimpleOnline: public DecodableInterface {
 
   /// Indices are one-based! This is for compatibility with OpenFst.
   virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
+  int32 FrameSubsamplingFactor() const { return opts_.frame_subsampling_factor; }
 
  private:
   /// If the neural-network outputs for this frame are not cached, it computes
diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc
index fd4881666ae..8dd366166c0 100644
--- a/src/online2/online-nnet3-decoding.cc
+++ b/src/online2/online-nnet3-decoding.cc
@@ -72,8 +72,9 @@ void SingleUtteranceNnet3Decoder::GetBestPath(bool end_of_utterance,
 
 bool SingleUtteranceNnet3Decoder::EndpointDetected(
     const OnlineEndpointConfig &config) {
+  int32 subsample = decodable_.FrameSubsamplingFactor();
   return kaldi::EndpointDetected(config, tmodel_,
-                                 feature_pipeline_->FrameShiftInSeconds(),
+                                 feature_pipeline_->FrameShiftInSeconds() * subsample,
                                  decoder_);
 }
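The endpointing fix matters because a model with frame subsampling emits one decoder frame per frame_subsampling_factor input frames, so durations measured in decoder frames correspond to more wall-clock time than the raw feature frame shift suggests. The values below are only an illustration of the scaling now applied:

# Illustration only: typical values, not read from any config here.
frame_shift_seconds = 0.01      # 10 ms feature frame shift
frame_subsampling_factor = 3    # common value for chain models

# EndpointDetected() now receives the effective shift per decoder frame:
effective_shift = frame_shift_seconds * frame_subsampling_factor
print(effective_shift)  # 0.03 -> 30 ms of audio per decoded frame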