diff --git a/.gitattributes b/.gitattributes index 5a815654b4c..bede44edf8a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,4 +15,6 @@ windows/INSTALL* eol=native windows/NewGuidCmd.exe.config text eol=crlf windows/NewGuidCmd.exe binary +# Prevent git changing CR-LF to LF when archiving (patch requires CR-LF on Windows). +**/*.patch -text diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt new file mode 100644 index 00000000000..71ab9f0fa45 --- /dev/null +++ b/egs/swbd/s5c/local/chain/README.txt @@ -0,0 +1,12 @@ + +there are a lot of tuning experiments here. + +ones to look at right now: + 2y is a TDNN baseline + 4f is a good jesus-layer system + 4q is an improved TDNN with various bells and whistles from Vijay. + 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far. + + + diff --git a/egs/swbd/s5c/local/chain/run_discriminative.sh b/egs/swbd/s5c/local/chain/run_discriminative.sh new file mode 100755 index 00000000000..f2b4da87920 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_discriminative.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +set -e +set -o pipefail + +# this is run_discriminative.sh + +# This script does discriminative training on top of nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# +# Note: rather than using any features we have dumped on disk, this script +# regenerates them from the wav data three times-- when we do lattice +# generation, numerator alignment and discriminative training. This made the +# script easier to write and more generic, because we don't have to know where +# the features and the iVectors are, but of course it's a little inefficient. +# The time taken is dominated by the lattice generation anyway, so this isn't +# a huge deal. + +. 
cmd.sh + + +stage=0 +train_stage=-10 +get_egs_stage=-10 +use_gpu=true +srcdir=exp/chain/tdnn_5e_sp +criterion=smbr +drop_frames=false # only matters for MMI. +frames_per_eg=150 +frames_overlap_per_eg=30 +effective_learning_rate=0.0000125 +max_param_change=1 +num_jobs_nnet=4 +train_stage=-10 # can be used to start training in the middle. +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. +num_epochs=4 +degs_dir= +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). +regularization_opts= +lats_dir= +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +one_silence_class=true +truncate_deriv_weights=10 +minibatch_size=64 + +adjust_priors=true + +determinize=true +minimize=true +remove_output_symbols=true +remove_epsilons=true +collapse_transition_ids=true + +modify_learning_rates=true +last_layer_factor=1.0 + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if $use_gpu; then + if ! cuda-compiled; then + cat <2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. 
+ +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. 
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +# max_param_change=1.0 +max_param_change=0.5 # Changed it to this value on iteration 74. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 # switched to 64 on iteration 7 after a failure. +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --jesus-dim 800 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh new file mode 100755 index 00000000000..ca8080db080 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# (note: cannot be reproduced using current scripts). +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# Results are about the same as 2y, or maybe just a little worse. 
+ +# a03:s5c: ./show_wer.sh 3d +# %WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
+ +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but 
this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh new file mode 100755 index 00000000000..af5661b8c85 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh @@ -0,0 +1,275 @@ +#!/bin/bash + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. +# (note: cannot be reproduced using current scripts). + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3e # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh new file mode 100755 index 00000000000..f33459f5f08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh @@ -0,0 +1,283 @@ +#!/bin/bash + + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# (note: cannot be reproduced using current scripts). +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. 
+# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames.
+ +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. 
+ +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies.
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh new file mode 100755 index 00000000000..ff1e539306f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh @@ -0,0 +1,303 @@ +#!/bin/bash + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# (note: cannot be reproduced using current scripts). +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training.
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs.
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh new file mode 100755 index 00000000000..f0e9efc2ac4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py.
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... 
this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t.
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states.
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies.
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh new file mode 100755 index 00000000000..876048b5852 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. +# also a code fix (the recurrent connections weren't being used; bug in OptionalDescriptor) + +# Here is the original decoding, with frame-per-chunk=50 +#./show_wer.sh 3i +#%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# and a newer decoding with frames-per-chunk=100. 
+# ./show_wer.sh 3i +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# after initial decoding wasn't great, trying increasing frames-per-chunk from +# 50 to 100. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :-3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million.
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames.
+ +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. 
+ +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 100 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh new file mode 100755 index 00000000000..faef84e8879 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh @@ -0,0 +1,296 @@ +#!/bin/bash + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training.
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs.
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh new file mode 100755 index 00000000000..b869c7b2553 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. 
+ +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse.
+ +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure.
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer.
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh new file mode 100755 index 00000000000..7a016ed2197 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# 3k2 is as 3k, but dumping the egs with --extra-left-context 20. +# Also there will have been some script changes in the meantime, +# e.g. possibly nonzero bias-mean; and reduced max-change on mix-up +# iters. + +# log-probs are better than 3k and in fact better than any experiment so far: +# valid -0.115->-0.107, and train -0.077 to -0.074. + +# Here is the WER using the default --frames-per-chunk of 50, and --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 20.45 [ 10060 / 49204, 988 ins, 3050 del, 6022 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 19.02 [ 9359 / 49204, 977 ins, 2877 del, 5505 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 22.3 | 4459 42989 | 79.9 12.8 7.3 2.3 22.3 60.2 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 20.4 | 4459 42989 | 81.5 11.1 7.4 1.9 20.4 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.filt.sys + +#... 
and here is the WER after changing it to 150, still with --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 18.91 [ 9306 / 49204, 1076 ins, 2517 del, 5713 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 17.43 [ 8574 / 49204, 958 ins, 2607 del, 5009 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.0 2.4 20.6 58.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 18.8 | 4459 42989 | 83.4 10.9 5.6 2.3 18.8 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is --frames-per-chunk 150, --extra-left-context 50 (changing the extra-left-context from 20 to 50 makes it worse): +#./show_wer.sh 3k2 +#%WER 19.46 [ 9574 / 49204, 1134 ins, 2635 del, 5805 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.87 [ 8792 / 49204, 880 ins, 3011 del, 4901 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 21.0 | 4459 42989 | 81.2 12.4 6.3 2.2 21.0 58.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.2 | 4459 42989 | 82.7 10.8 6.5 1.9 19.2 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --frames-per-chunk 150, --extra-left-context 50, --extra-left-context-initial 20. 
+#./show_wer.sh 3k2 +#%WER 19.10 [ 9400 / 49204, 1116 ins, 2498 del, 5786 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.54 [ 8628 / 49204, 884 ins, 2890 del, 4854 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.1 2.3 20.6 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 18.7 | 4459 42989 | 83.4 10.8 5.8 2.1 18.7 55.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --extra-left-context-initial 20 --extra-left-context 50 --frames-per-chunk 100. +# I think what's happening is that it's figuring out when it's near the end of the chunk, and encouraging +# deletions at that point, for reasons that relate to edge effects in the objective function. +#./show_wer.sh 3k2 +#%WER 17.87 [ 8793 / 49204, 1061 ins, 2277 del, 5455 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.36 [ 8049 / 49204, 1033 ins, 2148 del, 4868 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.8 11.8 5.5 2.5 19.7 57.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.4 10.3 5.2 2.2 17.8 54.7 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. 
+ +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse.
+ +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure.
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer.
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k2 # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --extra-left-context 20 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial 20 \ + --extra-left-context 50 \ + --frames-per-chunk 100 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh new file mode 100755 index 00000000000..608e437659e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +# [abandoned, not working well.] +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). 
+ +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :-3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff.
+ +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3l # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh new file mode 100755 index 00000000000..b25f9f15130 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# [note: this uses BlockAffineComponent not RepeatedAffineComponent] +# _3m is as _3l, but changing --jesus-stddev-scale from 0.2 to 0.1, as the Jesus layers +# were learning too slowly in 3l (this will make them learn approximately 4x faster). +# [terminated, likelihoods were not promising]. + +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. 
(see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3m # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.1 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh new file mode 100755 index 00000000000..dedbd84be75 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. 
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh new file mode 100755 index 00000000000..14383fe1a32 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. +# [ seemed helpful based on likelihoods on first iterations]: on iter 42, +# train prob is -0.1554->-0.1523, and valid prob is -0.1559->-0.1540. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. 
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh new file mode 100755 index 00000000000..ddba7e7f9c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# Comparing the WER with 2y, it's about 1% abs worse [see below]. However, this is +# for an odd reason: the model, while smaller than the 2y one (8.8 vs. 12.1 million +# parameters), seems to have a lot more learning capacity, with better train and worse valid +# prob. In 3r and 3s I am trying smaller versions of this architecture. 
+ +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# 2y 3p +# final-train-prob: -0.083068 -0.0771 +# final-valid-prob: -0.01212 -0.12715 +# num-parameters: 12094115 8804087 + + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. 
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh new file mode 100755 index 00000000000..9f67164b806 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh @@ -0,0 +1,315 @@ +#!/bin/bash + +# _3q is as _3p, but now trying out the 'block' training script, where in addition to +# the affine connections we have block-matrix connections between the layers. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. 
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-block-opts "--jesus-full-output-dim 900 --jesus-full-input-dim 900 --jesus-block-input-dim 900 --jesus-block-output-dim 900 --jesus-hidden-dim 15000 --jesus-final-output-dim 600 --jesus-stddev-scale 0.4 --num-affine-blocks 25 --final-layer-target-rms 0.5" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,0,3 -6,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh new file mode 100755 index 00000000000..7815adffb9f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] +# [I think I abandoned this after deciding to reduce the parameters even further, +# to the setup in 3s]. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. 
+ +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 
-3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. 
+ +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. 
+# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh new file mode 100755 index 00000000000..6cee8b11925 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3s # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh new file mode 100755 index 00000000000..25e30900e36 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +# _3t is as _3s but using slightly wider context. Dumping our own egs. +# The final train prob is better -0.0851->-0.0815, but valid prob is worse -0.1231->-0.1243. +# WER is slightly worse. So we won't use this for now, but later if we use more data we +# could try wider context like this. 
+#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# +#%WER 18.01 [ 8860 / 49204, 1043 ins, 2315 del, 5502 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.68 [ 8205 / 49204, 930 ins, 2420 del, 4855 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.6 11.9 5.5 2.3 19.7 57.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.4 5.4 2.0 17.8 55.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. 
+ +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | 
exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. 
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh new file mode 100755 index 00000000000..d1b93d9084c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +# _3u is as _3s (and re-using the egs) but with one more layer; keeping the same dim +# and total context, and reducing --jesus-forward-output-dim from 1500 to 1300 to +# ensure that the number of parameters doesn't increase too much. +# [stopping this run, as the likelihoods weren't promising, e.g. by iteration +# 39, the valid-prob was worse vs. 3t, -0.1488 -> -0.1521 (train: -0.1510 -> -0.1532) + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 
del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
+ +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might 
appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh new file mode 100755 index 00000000000..c7fcb7e24f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# _3v is as _3t but decreasing the --num-jesus-blocks from 100 to 50. +# I stopped it early after likelihoods were not promising: +# on iter 90, train prob was -0.1226->-0.1240, valid -0.1304->-0.1340. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. 
+# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 
| exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
+ +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --num-jesus-blocks 50 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # 
Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh new file mode 100755 index 00000000000..e4165e54de6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh @@ -0,0 +1,332 @@ +#!/bin/bash + +# _3w is as _3t but instead of having a rectangular affine component in each +# layer, making it square (700->600 not 1300->400), and introducing a new script +# option --final-hidden-dim to have something like a bottleneck at the last +# layer, to avoid a blowup in parameters. +# (note: num-params was slightly smaller, 4.8 million vs 5.3 million). +# I stopped this on iter 65 after likelihoods were not promising: +# on iter 63, train -0.133->-0.138, valid -0.138->-0.141. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500].
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3w # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 800 --final-hidden-dim 400 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh new file mode 100755 index 00000000000..1585d209a93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# _3x is as _3s (and continuing the same kind of experimentation as in 3t->3w)... +# increasing --jesus-forward-output-dim from 1500 to 2000. +# More overtraining: final-train -0.0852->-0.0799, final-valid -0.1231->-0.1261, +# WER effect is very tiny but maybe slightly better. 
+#a03:s5c: ./show_wer.sh 3x +#%WER 17.78 [ 8750 / 49204, 910 ins, 2405 del, 5435 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 16.60 [ 8166 / 49204, 921 ins, 2290 del, 4955 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.5 | 4459 42989 | 82.7 11.4 5.9 2.2 19.5 57.5 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.7 | 4459 42989 | 84.3 10.3 5.5 1.9 17.7 54.6 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] 
+ +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 
17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer.
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 2000 --final-hidden-dim 350 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh new file mode 100755 index 00000000000..042ec84898b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _3y is as _3s but doubling jesus-hidden-dim from 15000 to 30000. +# not promising: by iteration 228, train prob changed -0.09583->-0.09575, and +# valid prob from -0.1213 -> -0.1239. Killed it. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y. 
+ +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. 
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 
2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
+ +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 30000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this 
$lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh new file mode 100755 index 00000000000..f1fa2c5a45e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +# _3z is as _3s, but reducing the target num-states in the tree building from 9k to 6k. +# A slight degradation in WER, but it's not 100% consistent. The final train-prob +# was worse -0.0852 -> -0.0888, and valid-prob was worse -0.1231->-0.1280. +#./show_wer.sh 3z +#%WER 18.05 [ 8883 / 49204, 990 ins, 2397 del, 5496 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.50 [ 8120 / 49204, 960 ins, 2234 del, 4926 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.5 11.9 5.5 2.2 19.7 57.6 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 1.9 17.8 55.1 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. 
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): 
[600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3z # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh new file mode 100755 index 00000000000..c02ad2cb0e4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh @@ -0,0 +1,349 @@ +#!/bin/bash + +# _4a is as _3s, but using narrower splice-indexes in the first layer. +# WER is maybe a fraction worse than 3s (see below); final train prob is +# worse -0.0852 -> -0.0879, and valid prob is better -0.121 ->-0.1213 +#./show_wer.sh 4a +#%WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4a # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh new file mode 100755 index 00000000000..aad278c3037 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4b is as _4a, but even narrower splice-indexes in 1st layer (no splicing) +# stopped early after train and valid likelihoods were not promising. +# [later accidentally overwrote and moved the dir.] + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4b # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh new file mode 100755 index 00000000000..d9060251844 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4c # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh new file mode 100755 index 00000000000..1ae220dc21a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4d is as _4a, but with --egs-opts "--frames-overlap-per-eg 10 +# --cut-zero-frames 5" and changing apply-deriv-weights to true... this to +# activate the new-style derivative weights. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4d # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights true \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --cut-zero-frames 5" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh new file mode 100755 index 00000000000..fea5495ee06 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh @@ -0,0 +1,362 @@ +#!/bin/bash + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. +# big improvement- about 0.7% WER abs. Considering the non-l2 part of the objf, the +# final valid objf c->e is -0.1241->-0.1266 [and the l2 term is -0.0196]. +# and for the training st it's -0.08820 -> -0.1149. 
+ + +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4c +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4e # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.0001 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh new file mode 100755 index 00000000000..36d5f188c56 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh @@ -0,0 +1,366 @@ +#!/bin/bash + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4f # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh new file mode 100755 index 00000000000..430c6c28c70 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh @@ -0,0 +1,365 @@ +#!/bin/bash + +# _4g is as _4c, but reducing the --jesus-hidden-dim further from 7500 to 4000. +# Strangely, the trend from 4a->4c does not continue: instead of continuing to get worse, +# the train and valid probs both get better. + +# 4a 4c 4g +# Final train prob: -0.0879 -0.08820 -0.08784 +# Final valid prob: -0.1214 -0.1241 -0.1204 + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241.
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4g # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 4000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh new file mode 100644 index 00000000000..9125d4e7967 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh @@ -0,0 +1,386 @@ +#!/bin/bash + +# _4n is as _4f, but adding the [new] option --convert-repeated-to-block-iter=100. +# reusing iter 100 of model 4f to avoid some iterations of training [did this by +# doing (cd exp/chain; cp -r tdnn_4f_sp tdnn_4n_sp), and then running this script with +# --iter 100]. +# [note: to get the block-affine stuff to train fast enough to make a difference +# I multiplied a factor of sqrt(num-blocks) into the learning-rate factor in +# the code. That change is not committed.] +# +# Essentially no effect on WER, but train and valid probs are worse. +# ./compare_wer.sh 4f 4n +# System 4f 4n +# WER on train_dev(tg) 16.83 16.84 +# WER on train_dev(fg) 15.73 15.69 +# WER on eval2000(tg) 18.4 18.4 +# WER on eval2000(fg) 16.6 16.6 +# Final train prob -0.105832 -0.111309 +# Final valid prob -0.123021 -0.123601 + + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4n # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --convert-repeated-to-block-iter 100 \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh new file mode 100755 index 00000000000..d2b073cdc77 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# _4p is as _4f, but one fewer layer, and making the final-layer context wider to +# compensate; also increasing the jesus-layer input and output dims 400->500 and 1500->1600 to +# somewhat compensate for the reduction in parameters. + +# definitely worse. Later with 4r I go in the opposite direction by adding a new layer, +# and get a small improvement. +# ./compare_wer.sh 4f 4p +# System 4f 4p +# WER on train_dev(tg) 16.83 17.36 +# WER on train_dev(fg) 15.73 16.10 +# WER on eval2000(tg) 18.4 19.1 +# WER on eval2000(fg) 16.6 17.2 +# Final train prob -0.105832 -0.104439 +# Final valid prob -0.123021 -0.125576 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4p # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 450 --jesus-forward-output-dim 1600 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..9f2534f4f22 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh new file mode 100755 index 00000000000..64831b5802a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4r # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh new file mode 100755 index 00000000000..92a1a7da277 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option- +#currently in a branch] +# Overall no real change. + +# ./compare_wer.sh 4f 4s +# System 4f 4s +# WER on train_dev(tg) 16.83 16.82 +# WER on train_dev(fg) 15.73 15.62 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.111371 +# Final valid prob -0.123021 -0.12648 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4s # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.02 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh new file mode 100755 index 00000000000..30b383d05d7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# [note, I accidentally overwrote this directory afterwards, and moved it.] +# It's really not clear whether it's helpful. +# ./compare_wer.sh 4f 4t +# System 4f 4t +# WER on train_dev(tg) 16.83 16.75 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.112721 +# Final valid prob -0.123021 -0.129688 + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh new file mode 100755 index 00000000000..ae7cf02b426 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh @@ -0,0 +1,384 @@ +#!/bin/bash + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +# It seems a bit better on average. +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh new file mode 100755 index 00000000000..9cdbfefb5a2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. 
+# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4v # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh new file mode 100755 index 00000000000..6dd5c587f7a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -0,0 +1,397 @@ +#!/bin/bash + +# _4w is as _4v, but doubling --xent-regularize to 0.2 WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. 
+# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4w # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh new file mode 100755 index 00000000000..0290e0bdbd5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] 
+ +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4x # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. 
+# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. 
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): 
[600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e 
is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 
17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training.
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs.
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. 
+ +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. 
+ +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training.
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs.
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs; touch $dir/egs/.nodelete # create the egs dir if needed (touch would fail under 'set -e' otherwise); keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & # each decode set runs in its own background subshell + done +fi +wait; # wait for the background decode jobs; note that their exit status is not checked here +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500.
Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs; touch $dir/egs/.nodelete # create the egs dir if needed (touch would fail under 'set -e' otherwise); keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | 
exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! 
+# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. 
This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true.
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/confidence_calibration.sh b/egs/swbd/s5c/local/confidence_calibration.sh index de330866622..1e2337ab298 100755 --- a/egs/swbd/s5c/local/confidence_calibration.sh +++ b/egs/swbd/s5c/local/confidence_calibration.sh @@ -49,9 +49,11 @@ paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unig ###### Train the calibration, +false && \ steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ $dev_data $graph $word_feats $dev_latdir $dev_caldir + ###### Apply the calibration to eval set, steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ $eval_data $graph $eval_latdir $dev_caldir $eval_caldir diff --git a/egs/swbd/s5c/local/nnet3/run_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_discriminative.sh new file mode 100755 index 00000000000..3237102a63d --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_discriminative.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +set -e +set -o pipefail + +# this is run_discriminative.sh + +# This script does discriminative training on top of nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. 
+# +# Note: rather than using any features we have dumped on disk, this script +# regenerates them from the wav data three times-- when we do lattice +# generation, numerator alignment and discriminative training. This made the +# script easier to write and more generic, because we don't have to know where +# the features and the iVectors are, but of course it's a little inefficient. +# The time taken is dominated by the lattice generation anyway, so this isn't +# a huge deal. + +. cmd.sh + + +stage=0 +train_stage=-10 +get_egs_stage=-10 +use_gpu=true +srcdir=exp/nnet3/nnet_ms_a +criterion=smbr +drop_frames=false # only matters for MMI. +frames_per_eg=150 +frames_overlap_per_eg=30 +effective_learning_rate=0.0000125 +max_param_change=1 +num_jobs_nnet=4 +train_stage=-10 # can be used to start training in the middle. +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. +num_epochs=4 +degs_dir= +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). +lats_dir= +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +one_silence_class=true +truncate_deriv_weights=10 +minibatch_size=64 + +adjust_priors=true + +determinize=true +minimize=true +remove_output_symbols=true +remove_epsilons=true +collapse_transition_ids=true + +modify_learning_rates=true +last_layer_factor=1.0 + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if $use_gpu; then + if ! 
cuda-compiled; then + cat <) { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ - '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' + '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' '[noise]' '[laughter]' '[vocalized-noise]' '' '%hesitation' } for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh index 3bce900aecf..7b8a620ac31 100755 --- a/egs/swbd/s5c/local/score_sclite.sh +++ b/egs/swbd/s5c/local/score_sclite.sh @@ -7,7 +7,9 @@ stage=0 min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 +get_conf=false #end configuration section. [ -f ./path.sh ] && . ./path.sh @@ -28,7 +30,7 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; @@ -62,16 +64,28 @@ mkdir -p $dir/scoring/log if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm $frame_shift_opt ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + if ! 
$get_conf; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \ + nbest-to-ctm $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \ + lattice-to-ctm-conf $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + fi done fi diff --git a/egs/wsj/s5/local/nnet3/run_discriminative.sh b/egs/wsj/s5/local/nnet3/run_discriminative.sh new file mode 100755 index 00000000000..14ed587ade0 --- /dev/null +++ b/egs/wsj/s5/local/nnet3/run_discriminative.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +set -e +set -o pipefail + +# this is run_discriminative.sh + +# This script does discriminative training on top of nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# +# Note: rather than using any features we have dumped on disk, this script +# regenerates them from the wav data three times-- when we do lattice +# generation, numerator alignment and discriminative training. 
This made the +# script easier to write and more generic, because we don't have to know where +# the features and the iVectors are, but of course it's a little inefficient. +# The time taken is dominated by the lattice generation anyway, so this isn't +# a huge deal. + +. cmd.sh + + +stage=0 +train_stage=-10 +get_egs_stage=-10 +use_gpu=true +srcdir=exp/nnet3/nnet_ms_a +criterion=smbr +drop_frames=false # only matters for MMI. +frames_per_eg=150 +frames_overlap_per_eg=30 +effective_learning_rate=0.0000125 +num_jobs_nnet=4 +train_stage=-10 # can be used to start training in the middle. +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. +num_epochs=4 +degs_dir= +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). +lats_dir= +train_data_dir=data/train_si284_hires +online_ivector_dir=exp/nnet3/ivectors_train_si284 +one_silence_class=true +truncate_deriv_weights=10 +minibatch_size=64 + +adjust_priors=true + +determinize=true +minimize=true +remove_output_symbols=true +remove_epsilons=true +collapse_transition_ids=true + +modify_learning_rates=true +last_layer_factor=1.0 + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if $use_gpu; then + if ! 
cuda-compiled; then + cat <' $dir/JOB.ctm # Merge and clean, @@ -75,8 +81,8 @@ fi # Create the forwarding data for logistic regression, if [ $stage -le 2 ]; then - steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ - $dir/ctm_int $word_feats $latdepth $word_categories + python steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ + --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories fi # Apply calibration model to dev, diff --git a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py index 709f60b8ad6..3ccdf6fb164 100755 --- a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py +++ b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py @@ -1,4 +1,4 @@ -#!/bin/env python +#!/usr/bin/env python # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index 003d77c5e8a..23db9633a1c 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -13,8 +13,8 @@ The logisitc-regression input features are: - posteriors from 'ctm' transformed by logit, - logarithm of word-length in letters, -- logarithm of average lattice-depth at position of the word, - 10base logarithm of unigram probability of a word from language model, +- logarithm of average lattice-depth at position of the word (optional), The logistic-regresion targets are: - 1 for correct word, @@ -33,12 +33,13 @@ parser = OptionParser(usage=usage, description=desc) parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') +parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). 
[default %default]", default='') (o, args) = parser.parse_args() -if len(args) != 4: +if len(args) != 3: parser.print_help() sys.exit(1) -ctm_file, word_feats_file, depths_file, word_categories_file = args +ctm_file, word_feats_file, word_categories_file = args assert(o.conf_feats != '') @@ -76,10 +77,12 @@ # Load the per-frame lattice-depth, # - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file, -depths = dict() -for l in open(depths_file): - utt,d = l.split(' ',1) - depths[utt] = map(int,d.split()) +# - if the 'ctm' and 'ark' keys don't match, we leave this feature out, +if o.lattice_depth: + depths = dict() + for l in open(o.lattice_depth): + utt,d = l.split(' ',1) + depths[utt] = map(int,d.split()) # Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt', wrd_to_cat = [ l.split() for l in open(word_categories_file) ] @@ -98,15 +101,19 @@ logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper) # - log of word-length, log_word_length = math.log(word_length[wrd_id]) # i.e. 
number of phones in a word, - # - log of average-depth of lattice at the word position, - depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] - log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) # - categorical distribution of words (with frequency higher than min-count), wrd_1_of_k = [0]*wrd_cat_num; wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; # Compose the input feature vector, - feats = [ logit, log_word_length, log_avg_depth, other_feats[wrd_id] ] + wrd_1_of_k + feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k + + # Optionally add average-depth of lattice at the word position, + if o.lattice_depth != '': + depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) + feats += [ log_avg_depth ] + # Store the input features, f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh index 64ca70022c8..c5d2082ab90 100755 --- a/egs/wsj/s5/steps/conf/train_calibration.sh +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -76,22 +76,22 @@ fi # Get evaluation of the 'ctm' using the 'text' reference, if [ $stage -le 1 ]; then - steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ + python steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ align-text --special-symbol="" ark:$data/text ark:- ark,t:- | \ utils/scoring/wer_per_utt_details.pl --special-symbol "" \ >$dir/align_text # Append alignment to ctm, - steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned + python steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned # Convert words to 'ids', cat $dir/ctm_aligned | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_aligned_int fi # Prepare word-categories (based on wotd frequencies in 'ctm'), if [ -z "$category_text" ]; then - steps/conf/convert_ctm_to_tra.py 
$dir/ctm - | \ - steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories + python steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ + python steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories else - steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories + python steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories fi # Compute lattice-depth, @@ -102,9 +102,9 @@ fi # Create the training data for logistic regression, if [ $stage -le 3 ]; then - steps/conf/prepare_calibration_data.py \ + python steps/conf/prepare_calibration_data.py \ --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ - $dir/ctm_aligned_int $word_feats $latdepth $dir/word_categories + --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories fi # Train the logistic regression, diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh index 753411f4563..e4e726522f0 100755 --- a/egs/wsj/s5/steps/nnet2/decode.sh +++ b/egs/wsj/s5/steps/nnet2/decode.sh @@ -151,7 +151,7 @@ if [ $stage -le 2 ]; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; echo "score best paths" - local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + local/score.sh --iter $iter $scoring_opts --cmd "$cmd" $data $graphdir $dir echo "score confidence and timing with sclite" fi fi diff --git a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh index 4c08a08b824..b1c145b6157 100755 --- a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh +++ b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh @@ -28,6 +28,15 @@ online_ivector_dir= num_utts_subset=3000 num_archives_priors=10 +left_context= +right_context= + +collapse_transition_ids=true +determinize=true +minimize=true +split=true +excise=true + # End configuration section. @@ -248,8 +257,17 @@ if [ -d $dir/storage ]; then fi rm $dir/.error 2>/dev/null -left_context=$(nnet-am-info $dir/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1 -right_context=$(nnet-am-info $dir/final.mdl | grep '^right-context' | awk '{print $2}') || exit 1 +if [ -z "$left_context" ]; then + left_context=$(nnet-am-info $dir/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1 +fi +if [ -z "$right_context" ]; then + right_context=$(nnet-am-info $dir/final.mdl | grep '^right-context' | awk '{print $2}') || exit 1 +fi + +nnet_context_opts="--left-context=$left_context --right-context=$right_context" + +echo "left-context=$left_context" +echo "right-context=$right_context" ( @@ -261,8 +279,6 @@ for y in `seq $num_archives_priors`; do priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark" done -nnet_context_opts="--left-context=$left_context --right-context=$right_context" - echo "$0: dumping egs for prior adjustment in the background." 
$cmd $dir/log/create_priors_subset.log \ @@ -279,13 +295,15 @@ fi ) & +discriminative_egs_opts="--determinize=$determinize --minimize=$minimize --collapse-transition-ids=$collapse_transition_ids --split=$split --excise=$excise" + if [ $stage -le 3 ]; then echo "$0: getting initial training examples by splitting lattices" degs_list=$(for n in $(seq $num_archives_temp); do echo ark:$dir/degs_orig.JOB.$n.ark; done) $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \ + nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames $nnet_context_opts $discriminative_egs_opts \ "$src_model" "$feats" "$ali_rspecifier" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \ nnet-copy-egs-discriminative $const_dim_opt ark:- $degs_list || exit 1; sleep 5; # wait a bit so NFS has time to write files. diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet2/train_discriminative2.sh b/egs/wsj/s5/steps/nnet2/train_discriminative2.sh index 85047efc581..eb719838f36 100755 --- a/egs/wsj/s5/steps/nnet2/train_discriminative2.sh +++ b/egs/wsj/s5/steps/nnet2/train_discriminative2.sh @@ -98,7 +98,7 @@ dir=$2 [ -z "$src_model" ] && src_model=$degs_dir/final.mdl # Check some files. -for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_archive} $src_model; do +for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl} $src_model; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/nnet3/adjust_priors.sh b/egs/wsj/s5/steps/nnet3/adjust_priors.sh new file mode 100755 index 00000000000..d75eef0536d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/adjust_priors.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +. path.sh + +cmd=run.pl +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +use_gpu=false # if true, we run on GPU. +egs_type=egs +raw=false +use_degs=false +iter=final + +. utils/parse_options.sh + +if $use_degs && [ $egs_type == egs ]; then + egs_type=degs +fi + +echo "$0 $@" # Print the command line for logging + +if [ $# -ne 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500" + exit 1 +fi + +dir=$1 +egs_dir=$2 + +if $use_gpu; then + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" +else + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + +for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do + if [ ! -f $f ]; then + echo "$f not found" + exit 1 + fi +done + +if $raw; then + model=$dir/$iter.raw +else + model="nnet3-am-copy --raw=true $dir/$iter.mdl - |" +fi + +rm -f $dir/post.$iter.*.vec 2>/dev/null + +left_context=`cat $egs_dir/info/left_context` || exit 1 +right_context=`cat $egs_dir/info/right_context` || exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; +else egs_part=JOB; fi + +if ! 
$use_degs; then + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ + nnet3-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "$model" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; +else + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ + nnet3-discriminative-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-discriminative-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-discriminative-merge-egs ark:- ark:- \| \ + nnet3-compute-from-degs $prior_gpu_opt --apply-exp=true \ + "$model" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; + +fi + +sleep 3; # make sure there is time for $dir/post.$iter.*.vec to appear. + +$cmd $dir/log/vector_sum.$iter.log \ + vector-sum $dir/post.$iter.*.vec $dir/post.$iter.vec || exit 1; + +if ! $raw; then + run.pl $dir/log/adjust_priors.$iter.log \ + nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/$iter.adj.mdl +fi + +rm -f $dir/post.$iter.*.vec; + diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index 71947961da4..22d2a618ed7 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -118,9 +118,16 @@ echo "$0: aligning data in $data using model from $srcdir, putting alignments in tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + cp $srcdir/frame_subsampling_factor $dir +fi + $cmd JOB=1:$nj $dir/log/align.JOB.log \ compile-train-graphs $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ - nnet3-align-compiled $scale_opts $ivector_opts \ + nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \ --use-gpu=$use_gpu --beam=$beam --retry-beam=$retry_beam \ $srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index fc75932d0d3..6eb0f51308b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -25,6 +25,8 @@ frames_per_eg=25 # number of frames of labels per example. more->less disk sp frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. +cut_zero_frames=-1 # if activated, activates new-style derivative weights.. i'll reorganize + # this if it works well. frame_subsampling_factor=3 # ratio between input and output frame-rate of nnet. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). @@ -44,7 +46,9 @@ num_egs_diagnostic=400 # number of frames for "compute_prob" jobs frames_per_iter=400000 # each iteration of training, see this many frames # per job. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. + right_tolerance= #CTC right tolerance == max label delay. 
+left_tolerance= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms @@ -263,7 +267,7 @@ if [ $stage -le 2 ]; then fi -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames" [ -z $valid_left_context ] && valid_left_context=$left_context; @@ -275,6 +279,8 @@ ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$frame [ ! -z $right_tolerance ] && \ ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance" +[ ! -z $left_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 1a62d8d7bb6..f2af7d0fdcb 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -23,17 +23,27 @@ truncate_deriv_weights=0 # can be used to set to zero the weights of derivs fro apply_deriv_weights=true initial_effective_lrate=0.0002 final_effective_lrate=0.00002 +extra_left_context=0 # actually for recurrent setups. pnorm_input_dim=3000 pnorm_output_dim=300 relu_dim= # you can use this to make it use ReLU's instead of p-norms. + +jesus_opts= # opts to steps/nnet3/make_jesus_configs.py. + # If nonempty, assumes you want to use the jesus nonlinearity, + # and you should supply various options to that script in + # this string. rand_prune=4.0 # Relates to a speedup we do for LDA. 
minibatch_size=512 # This default is suitable for GPU-based training. # Set it to 128 for multi-threaded CPU-based training. lm_opts= # options to chain-est-phone-lm +l2_regularize=0.0 +leaky_hmm_coefficient=0.00001 +xent_regularize=0.0 frames_per_iter=800000 # each iteration of training, see this many [input] # frames per job. This option is passed to get_egs.sh. # Aim for about a minute of training time right_tolerance=10 +left_tolerance=5 denominator_scale=1.0 # relates to tombsone stuff. num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training @@ -66,6 +76,10 @@ exit_stage=-100 # you can set this to terminate the training early. Exits befor # count space-separated fields in splice_indexes to get num-hidden-layers. splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count @@ -87,7 +101,7 @@ right_deriv_truncate= # number of time-steps to avoid using the deriv of, on th # End configuration section. -trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM echo "$0 $@" # Print the command line for logging @@ -197,23 +211,43 @@ num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exi if [ $stage -le -5 ]; then echo "$0: creating neural net configs"; - if [ ! -z "$relu_dim" ]; then - dim_opts="--relu-dim $relu_dim" + + if [ ! 
-z "$jesus_opts" ]; then + python steps/nnet3/make_jesus_configs.py \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $jesus_opts \ + --num-targets $num_leaves \ + $dir/configs || exit 1; else - dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" - fi + [ $xent_regularize != "0.0" ] && \ + echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi - # create the config files for nnet initialization - python steps/nnet3/make_tdnn_configs.py \ - --include-log-softmax=false \ - --final-layer-normalize-target $final_layer_normalize_target \ - --splice-indexes "$splice_indexes" \ - --feat-dim $feat_dim \ - --ivector-dim $ivector_dim \ - $dim_opts \ - --num-targets $num_leaves \ - --use-presoftmax-prior-scale false \ - $dir/configs || exit 1; + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + --include-log-softmax=false \ + --final-layer-normalize-target $final_layer_normalize_target \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --num-targets $num_leaves \ + --use-presoftmax-prior-scale false \ + $dir/configs || exit 1; + fi # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; @@ -242,12 +276,14 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts+=(--transform-dir $transform_dir) # we need a bit of extra left-context and right-context to allow for frame # shifts (we use shifted version of the data for more variety). - extra_opts+=(--left-context $[$left_context+$frame_subsampling_factor/2]) + extra_opts+=(--left-context $[$left_context+$frame_subsampling_factor/2+$extra_left_context]) extra_opts+=(--right-context $[$right_context+$frame_subsampling_factor/2]) echo "$0: calling get_egs.sh" steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ --frames-per-iter $frames_per_iter --stage $get_egs_stage \ --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ --frames-per-eg $frames_per_eg \ --frame-subsampling-factor $frame_subsampling_factor \ $data $dir $latdir $dir/egs || exit 1; @@ -414,11 +450,11 @@ while [ $x -lt $num_iters ]; do # Set off jobs doing some diagnostics, in the background. 
# Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & @@ -461,7 +497,9 @@ while [ $x -lt $num_iters ]; do rm $dir/.error 2>/dev/null - ( # this sub-shell is so that when we "wait" below, + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. @@ -476,6 +514,7 @@ while [ $x -lt $num_iters ]; do $cmd $train_queue_opt $dir/log/train.$x.$n.log \ nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \ + --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ $parallel_train_opts $deriv_time_opts \ --max-param-change=$this_max_param_change \ --print-interval=10 "$mdl" $dir/den.fst \ @@ -543,7 +582,7 @@ if [ $stage -le $num_iters ]; then # num-threads to 8 to speed it up (this isn't ideal...) 
$cmd $combine_queue_opt $dir/log/combine.log \ - nnet3-chain-combine --num-iters=40 \ + nnet3-chain-combine --num-iters=40 --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1; @@ -553,11 +592,11 @@ if [ $stage -le $num_iters ]; then # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. $cmd $dir/log/compute_prob_valid.final.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & fi diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 87323a1c3e1..1fc49290dfe 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -6,6 +6,45 @@ import sys import warnings import copy +from operator import itemgetter +import numpy as np +try: + import scipy.signal as signal + has_scipy_signal = True +except ImportError: + has_scipy_signal = False + +def WriteKaldiMatrix(matrix, matrix_file_name): + assert(len(matrix.shape) == 2) + # matrix is a numpy array + matrix_file = open(matrix_file_name, "w") + [rows, cols ] = matrix.shape + 
matrix_file.write('[\n') + for row in range(rows): + matrix_file.write(' '.join( map(lambda x: '{0:f}'.format(x), matrix[row, : ]))) + if row == rows - 1: + matrix_file.write("]") + else: + matrix_file.write('\n') + matrix_file.close() +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors + + # adds the input nodes and returns the descriptor def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): @@ -19,11 +58,26 @@ def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): components.append('input-node name=ivector dim=' + str(ivector_dim)) list.append('ReplaceIndex(ivector, t, 0)') output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] print(splice_descriptor) return {'descriptor': splice_descriptor, 'dimension': output_dim} +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + + + def AddLdaLayer(config_lines, name, input, lda_file): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -34,6 +88,30 @@ def AddLdaLayer(config_lines, name, 
input, lda_file): return {'descriptor': '{0}_lda'.format(name), 'dimension': input['dimension']} +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + + +def AddPermuteLayer(config_lines, name, input, column_map): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + + + def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -44,13 +122,13 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" return {'descriptor': '{0}_affine'.format(name), 'dimension': output_dim} -def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = ""): +def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0): components = config_lines['components'] component_nodes = config_lines['component-nodes'] 
components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) components.append("component name={0}_relu type=RectifiedLinearComponent dim={1}".format(name, output_dim)) - components.append("component name={0}_renorm type=NormalizeComponent dim={1}".format(name, output_dim)) + components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) @@ -60,6 +138,36 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options 'dimension': output_dim} +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = "component name={0}_conv type=ConvolutionComponent input-x-dim={1} input-y-dim={2} input-z-dim={3} filt-x-dim={4} filt-y-dim={5} filt-x-step={6} filt-y-step={7} input-vectorization-order={8}".format(name, input_x_dim, input_y_dim, input_z_dim, filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, input_vectorization) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + if is_updatable: + conv_init_string += " is-updatable=true" + else: + conv_init_string += " is-updatable=false" + + components.append(conv_init_string) + component_nodes.append("component-node name={0}_conv_t component={0}_conv 
input={1}".format(name, input['descriptor'])) + + num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim} + + def AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -72,7 +180,7 @@ def AddSoftmaxLayer(config_lines, name, input): 'dimension': input['dimension']} -def AddOutputNode(config_lines, input, label_delay=None): +def AddOutputLayer(config_lines, input, label_delay=None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] if label_delay is None: @@ -80,12 +188,18 @@ def AddOutputNode(config_lines, input, label_delay=None): else: component_nodes.append('output-node name=output input=Offset({0},{1})'.format(input['descriptor'], label_delay)) -def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = "", label_delay=None, include_softmax = "true"): +def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = " param-stddev=0 bias-stddev=0 ", label_delay=None, use_presoftmax_prior_scale = False, prior_scale_file = None, include_log_softmax = True): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim, ng_affine_options) - if include_softmax == "true": - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output, label_delay) - + if include_log_softmax: + if use_presoftmax_prior_scale : + components.append('component name=Final-fixed-scale type=FixedScaleComponent scales={0}'.format(prior_scale_file)) + component_nodes.append('component-node name=Final-fixed-scale component=Final-fixed-scale input={0}'.format(prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = 
"Final-fixed-scale" + prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) + AddOutputLayer(config_lines, prev_layer_output, label_delay) def AddLstmLayer(config_lines, name, input, cell_dim, diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index f4de09740ae..1a60118c67c 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -26,6 +26,10 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel parallel_opts= # ignored now. scoring_opts= skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 feat_type= online_ivector_dir= minimize=false @@ -132,11 +136,12 @@ if [ ! -z "$online_ivector_dir" ]; then fi if [ "$post_decode_acwt" == 1.0 ]; then - lat_wspecifier="ark|gzip -c >$dir/lat.JOB.gz" + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" else lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" fi +frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then # e.g. for 'chain' systems frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" @@ -146,6 +151,10 @@ if [ $stage -le 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ --word-symbol-table=$graphdir/words.txt "$model" \ @@ -161,7 +170,7 @@ if [ $stage -le 2 ]; then [ ! 
-x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; echo "score best paths" - local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + local/score.sh --iter $iter $scoring_opts --cmd "$cmd" $data $graphdir $dir echo "score confidence and timing with sclite" fi fi diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 88cf54e824e..2290c4d2e7f 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -34,6 +34,11 @@ 'shape':'box', 'style':'filled' }, + 'ConvolutionComponent':{ + 'color':'lightpink', + 'shape':'box', + 'style':'filled' + }, 'FixedScaleComponent':{ 'color':'blueviolet', 'shape':'box', @@ -64,6 +69,11 @@ 'shape':'rectangle', 'style':'filled' }, + 'ClipGradientComponent':{ + 'color':'bisque', + 'shape':'rectangle', + 'style':'filled' + }, 'ElementwiseProductComponent':{ 'color':'green', 'shape':'rectangle', @@ -84,10 +94,10 @@ def GetDotNodeName(name_string, is_component = False): # 2. 
Nnet3 names can be shared among components and component nodes # dot does not allow common names # - name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("-", "hyphen", name_string) if is_component: - name_string += name_string.strip() + "_component" - return name_string + node_name_string += node_name_string.strip() + "_component" + return {"label":name_string, "node":node_name_string} def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -96,18 +106,18 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = "{{"+label+"}|Append}" - dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -116,7 +126,7 @@ def 
ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) if attr_string != '': dot_string += ' [{0}] '.format(attr_string) @@ -125,6 +135,28 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = return dot_graph +def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None): + dot_graph = [] + + label = 'Round ({0})'.format(segment['arguments'][1]) + style = None + if edge_attributes is not None: + if edge_attributes.has_key('label'): + label = "{0} {1}".format(edge_attributes['label'], label) + if edge_attributes.has_key('style'): + style = 'style={0}'.format(edge_attributes['style']) + + attr_string = 'label="{0}"'.format(label) + if style is not None: + attr_string += ' {0}'.format(style) + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], + attr_string)) + if segment['sub_segments']: + raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed") + return dot_graph + + def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -140,8 +172,8 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("Offset can just 
deal with forwarding descriptor, no sub-segments allowed") @@ -151,21 +183,23 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non dot_graph = [] names = [] desc_name = 'Sum_{0}'.format(affix) + # create the sum node for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) - dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) + dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i)) + # link the sum node parts to corresponding segments part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = '{{'+label+'}|Sum}' - dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -174,7 +208,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string 
= '{0} -> {1}'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) dot_string += ' [{0} tailport=s ] '.format(attr_string) dot_graph.append(dot_string) @@ -195,8 +229,8 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed") @@ -215,7 +249,7 @@ def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes dot_graph += DescriptorSegmentToDot(sub_segment, parent_node_name, parent_node_name, edge_attributes={'style':'dotted', 'label':'IfDefined'}) if segment['arguments']: - dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0]), GetDotNodeName(parent_node_name))) + dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_graph @@ -232,6 +266,8 @@ def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = N dot_graph += ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes) elif segment['name'] == "ReplaceIndex": dot_graph += ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes) + elif segment['name'] == "Round": + dot_graph += ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes) else: raise Exception('Descriptor {0}, is not recognized by this script. 
Please add Process{0}Descriptor method'.format(segment['name'])) return dot_graph @@ -244,7 +280,7 @@ def Nnet3DescriptorToDot(descriptor, parent_node_name): dot_lines += DescriptorSegmentToDot(segment, parent_node_name, parent_node_name) elif arguments: assert(len(arguments) == 1) - dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0]), GetDotNodeName(parent_node_name))) + dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_lines def ParseNnet3String(string): @@ -298,27 +334,28 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): except KeyError: pass - return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True), label, attr_string)] + return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)] # input-node name=input dim=40 def Nnet3InputToDot(parsed_config): - return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['dim'] )] + return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['dim'] )] # output-node name=output input=Final_log_softmax dim=3940 objective=linear +#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear def Nnet3OutputToDot(parsed_config): dot_graph = [] - dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['objective'])) - dot_graph.append('{0} -> {1}'.format(GetDotNodeName(parsed_config['input']), GetDotNodeName(parsed_config['name']))) + dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) + dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], 
parsed_config['objective'])) return dot_graph # dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256 def Nnet3DimrangeToDot(parsed_config): dot_graph = [] - dot_graph.append(parsed_config['name']) - dot_graph.append('{0} [shape=rectangle]'.format(GetDotNodeName(parsed_config['name']))) - dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node']), - GetDotNodeName(parsed_config['name']), + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [shape=rectangle, label="{1}"]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node'])['node'], + GetDotNodeName(parsed_config['name'])['node'], parsed_config['dim-offset'], parsed_config['dim'])) return dot_graph @@ -326,9 +363,10 @@ def Nnet3DimrangeToDot(parsed_config): def Nnet3ComponentNodeToDot(parsed_config): dot_graph = [] dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) - dot_graph.append('{0} [ label="{1}", shape=box ]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'])) - dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True), - GetDotNodeName(parsed_config['name']))) + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [ label="{1}", shape=box ]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True)['node'], + GetDotNodeName(parsed_config['name'])['node'])) return dot_graph def GroupConfigs(configs, node_prefixes = []): @@ -408,6 +446,8 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): " will be clustered together in the dot-graph" " --node-prefixes Lstm1,Lstm2,Layer1", default=None) + parser.add_argument("dotfile", help="name of the dot output file") + 
print(' '.join(sys.argv), file=sys.stderr) args = parser.parse_args() @@ -420,4 +460,7 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): lines = sys.stdin.readlines() dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes) - print("\n".join(dot_graph)) + + dotfile_handle = open(args.dotfile, "w") + dotfile_handle.write("\n".join(dot_graph)) + dotfile_handle.close() diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index dc8cac9c0b0..364f6a72443 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -170,8 +170,8 @@ esac if [ -f $dir/trans.scp ]; then feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi if [ ! -z "$online_ivector_dir" ]; then diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh new file mode 100755 index 00000000000..2112b0ba227 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright 2014-2015 Vimal Manohar + +# This script dumps examples MPE or MMI or state-level minimum bayes risk (sMBR) +# training of neural nets. +# Criterion supported are mpe, smbr and mmi + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. 
+frames_per_eg=150 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet. + # this should be read from the nnet. For now, it is taken as an option +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +adjust_priors=true +priors_left_context= # amount of left_context for priors egs +priors_right_context= # amount of right_context for priors egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). +num_utts_subset=80 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. + +frames_per_iter=400000 # each iteration of training, see this many frames + # per job. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +determinize=false +minimize=false +remove_output_symbols=false +remove_epsilons=false +collapse_transition_ids=false +acwt=0.1 + +criterion=smbr + +stage=0 +#nj=15 # This should be set to the maximum number of jobs you are +# # comfortable to run in parallel; you can increase it if your disk +# # speed is greater and you have more machines. 
+max_shuffle_jobs_run=50 + +transform_dir= # If this is a SAT system, directory for transforms +online_ivector_dir= +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +num_priors_subset=100 +num_archives_priors=10 + +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 6 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs (probably would be good to add -tc 5 or so if using" + echo " # GridEngine (to avoid excessive NFS traffic)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + echo " --criterion # Training criterion: may be smbr, mmi or mpfe" + echo " --online-ivector-dir # Directory for online-estimated iVectors, used in the" + echo " # online-neural-net setup. (but you may want to use" + echo " # steps/online/nnet2/get_egs_discriminative2.sh instead)" + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +denlatdir=$4 +src_model=$5 +dir=$6 + +extra_files= +[ ! -z $online_ivector_dir ] && \ + extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp" + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \ + $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log $dir/info || exit 1; + +[ "$(readlink /bin/sh)" == dash ] && \ + echo "This script won't work if /bin/sh points to dash. make it point to bash." && exit 1 + +nj=$(cat $denlatdir/num_jobs) || exit 1; + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + +if [ $stage -le 1 ]; then + nj_ali=$(cat $alidir/num_jobs) + all_ids=$(seq -s, $nj_ali) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi + +prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |" + +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" 
]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +cp $alidir/tree $dir +cp $lang/phones/silence.csl $dir/info/ +cp $src_model $dir/final.mdl || exit 1 + +# Get list of utterances for prior computation. +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_priors_subset \ + > $dir/priors_uttlist || exit 1; + +## We don't support deltas here, only LDA or raw (mainly because deltas are less +## frequently used). +if [ -z $feat_type ]; then + if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts > $dir/cmvn_opts + ;; + lda) + splice_opts=`cat $alidir/splice_opts 2>/dev/null` + cp $alidir/splice_opts $dir 2>/dev/null + cp $alidir/final.mat $dir + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` + cp $alidir/cmvn_opts $dir 2>/dev/null + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! 
-z $online_ivector_dir ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim >$dir/info/ivector_dim + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + priors_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +fi + +if [ $stage -le 2 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# Working out total number of archives. Add one on the assumption the +# num-frames won't divide exactly, and we want to round up. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). 
+max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 3 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1; + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp +fi + +splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; + +[ -z $priors_left_context ] && priors_left_context=$left_context; +[ -z $priors_right_context ] && priors_right_context=$right_context; + +left_context=$[left_context+frame_subsampling_factor/2] +right_context=$[right_context+frame_subsampling_factor/2] + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" + +valid_left_context=$[valid_left_context+frame_subsampling_factor/2] +valid_right_context=$[valid_right_context+frame_subsampling_factor/2] + +# don't do the overlap thing for the validation data. 
+valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" + +priors_left_context=$[priors_left_context+frame_subsampling_factor/2] +priors_right_context=$[priors_right_context+frame_subsampling_factor/2] + +# don't do the overlap thing for the priors computation data. +priors_egs_opts="--left-context=$priors_left_context --right-context=$priors_right_context --num-frames=1 --compress=$compress" + +supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +echo $priors_left_context > $dir/info/priors_left_context +echo $priors_right_context > $dir/info/priors_right_context + +echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor + +( + +if $adjust_priors && [ $stage -le 10 ]; then + +if [ ! -f $dir/ali.scp ]; then + nj_ali=$(cat $alidir/num_jobs) + all_ids=$(seq -s, $nj_ali) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi + +priors_egs_list= +for y in `seq $num_archives_priors`; do + utils/create_data_link.pl $dir/priors_egs.$y.ark + priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark" +done + +echo "$0: dumping egs for prior adjustment in the background." + +num_pdfs=`am-info $alidir/final.mdl | grep pdfs | awk '{print $NF}' 2>/dev/null` || exit 1 + +$cmd $dir/log/create_priors_subset.log \ + nnet3-get-egs --num-pdfs=$num_pdfs $priors_ivector_opt $priors_egs_opts "$priors_feats" \ + "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \ + ark:- \| nnet3-copy-egs ark:- $priors_egs_list || \ + { touch $dir/.error; echo "Error in creating priors subset. 
See $dir/log/create_priors_subset.log"; exit 1; } + +sleep 3; + +echo $num_archives_priors >$dir/info/num_archives_priors + +else + +echo 0 > $dir/info/num_archives_priors + +fi + +) & + +if [ $stage -le 4 ]; then + echo "$0: Getting validation and training subset examples." + rm -f $dir/.error 2>/dev/null || true + echo "$0: ... extracting validation and training-subset alignments." + + #utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + # <$dir/lat.scp >$dir/lat_special.scp + + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/ali.scp >$dir/ali_special.scp + + $cmd $dir/log/create_valid_subset.log \ + discriminative-get-supervision $supervision_all_opts \ + scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ + nnet3-discriminative-get-egs $valid_ivector_opt $valid_egs_opts \ + $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & + + $cmd $dir/log/create_train_subset.log \ + discriminative-get-supervision $supervision_all_opts \ + scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ + nnet3-discriminative-get-egs $train_subset_ivector_opt $egs_opts \ + $dir/final.mdl "$train_subset_feats" ark,s,cs:- "ark:$dir/train_diagnostic.degs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + + for f in $dir/{train_diagnostic,valid_diagnostic}.degs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done +fi + +if [ $stage -le 5 ]; then + # create degs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + degs_list= + for n in $(seq $num_archives_intermediate); do + degs_list="$degs_list ark:$dir/degs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to degs_list. 
+ # To make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + discriminative-get-supervision $supervision_all_opts \ + "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ + "ark:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \ + nnet3-discriminative-get-egs $ivector_opt $egs_opts \ + $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \ + nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1; +fi + +if [ $stage -le 6 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "degs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the degs.JOB.ark + + # the input is a concatenation over the input jobs. + degs_list= + for n in $(seq $nj); do + degs_list="$degs_list $dir/degs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:$dir/degs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
+ output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/degs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # degs.intermediate_archive.{1,2,...}.ark will point to degs.archive.ark + ln -sf degs.$archive_index.ark $dir/degs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:- \| \ + nnet3-discriminative-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'degs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; else print $NF; }'); do rm $f; done + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/degs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm -f $dir/lat_special.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm -f $dir/{ali,trans}.{ark,scp} 2>/dev/null || true +fi + +wait + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh new file mode 100755 index 00000000000..3d0d1e5e418 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -0,0 +1,252 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014-2015 Vimal Manohar +# Apache 2.0. + +# Create denominator lattices for MMI/MPE training. +# This version uses the neural-net models (version 3, i.e. the nnet3 code). +# Creates its output in $dir/lat.*.gz + +# Begin configuration section. 
+nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +frames_per_chunk=50 +lattice_beam=7.0 +self_loop_scale=0.1 +acwt=0.1 +max_active=5000 +min_active=200 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +num_threads=1 # Fixed to 1 for now +online_ivector_dir= +determinize=false +minimize=false +ivector_scale=1.0 +parallel_opts= # ignored now +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +feat_type= # you can set this in order to run on top of delta features, although we don't + # normally want to do this. +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +num_threads=1 # Fixed to 1 for now + +if [ $# != 4 ]; then + echo "Usage: steps/nnet3/make_denlats.sh [options] " + echo " e.g.: steps/nnet3/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + echo " --num-threads # number of threads per decoding job" + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + + +extra_files= +[ ! 
-z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $extra_files; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +cp -rH $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." 
+else + echo "Making unigram grammar FST in $new_lang" + cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ + || exit 1; + utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1; +fi +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! 
-f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + + +# if this job is interrupted by the user, we want any background jobs to be +# killed too. +cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids +} +trap "cleanup" INT QUIT TERM EXIT + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + cp $srcdir/frame_subsampling_factor $dir +fi + +lattice_determinize_cmd= +if $determinize; then + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$beam ark:- ark:- |" +fi + +if [ $sub_split -eq 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --determinize-lattice=false \ + --word-determinize=false --phone-determinize-lattice=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" \ + "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.JOB.gz" || exit 1 +else + + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have stragglers + # from one job, we can be processing another one at the same time. + rm $dir/.error 2>/dev/null + + prev_pid= + for n in `seq $[nj+1]`; do + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! 
-d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + + $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --determinize-lattice=false \ + --word-determinize=false --phone-determinize=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" \ + "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job; merge the previous set of lattices. + wait $prev_pid + [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1; + rm $dir/.merge_error 2>/dev/null + echo Merging archives for data subset $prev_n + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1; + rm $dir/lat.$prev_n.*.gz + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done +fi + + +echo "$0: done generating denominator lattices." 
+ diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py new file mode 100755 index 00000000000..a00e4cfd0e7 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -0,0 +1,490 @@ +#!/usr/bin/env python + +# tdnn or RNN with 'jesus layer' + + +# notes on jesus layer with recurrence: + +# inputs to jesus layer: +# - for each previous layer in regular splicing, the output of dim --jesus-forward-output-dim +# - for each recurrent connection: +# - direct input from the recurrence --jesus-direct-recurrence-dim +# - indirect [projected] input from recurrence. --jesus-projected-recurrence-input-dim +# outputs of jesus layer: +# for all layers: +# --jesus-forward-output-dim +# for recurrent layers: +# --jesus-direct-recurrence-dim +# --jesus-projected-recurrence-output-dim + + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings + + +parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/train_tdnn.sh for example."); +parser.add_argument("--splice-indexes", type=str, + help="Splice[:recurrence] indexes at each hidden layer, e.g. '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'. " + "Note: recurrence indexes are optional, may not appear in 1st layer, and must be " + "either all negative or all positive for any given layer.") +parser.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") +parser.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 
100", default=0) +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) +parser.add_argument("--use-repeated-affine", type=str, + help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)", + default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-learning-rate-factor", type=float, + help="Learning-rate factor for final affine component", + default=1.0) +parser.add_argument("--recurrent-projection-learning-rate-factor", type=float, + help="Learning-rate factor for recurrent projections", + default=10.0) +parser.add_argument("--jesus-hidden-dim", type=int, + help="hidden dimension of Jesus layer.", default=10000) +parser.add_argument("--jesus-forward-output-dim", type=int, + help="part of output dimension of Jesus layer that goes to next layer", + default=1000) +parser.add_argument("--jesus-forward-input-dim", type=int, + help="Input dimension of Jesus layer that comes from affine projection " + "from the previous layer (same as output dim of forward affine transform)", + default=1000) +parser.add_argument("--final-hidden-dim", type=int, + help="Final hidden layer dimension-- or if <0, the same as " + "--jesus-forward-input-dim", default=-1) +parser.add_argument("--jesus-direct-recurrence-dim", type=int, + help="part of output dimension of Jesus layer that comes directly from " + "different time instance of the same Jesus layer", default=1000) +parser.add_argument("--jesus-projected-recurrence-output-dim", type=int, + help="part of output dimension of Jesus layer (in recurrent layers) " + "that is destined for projection to dimension " + "--jesus-projected-recurrence-input-dim", default=500) 
+parser.add_argument("--jesus-projected-recurrence-input-dim", type=int, + help="part of input dimension of Jesus layer that comes via " + "projection from the output of the same Jesus layer at different time", + default=200) +parser.add_argument("--num-jesus-blocks", type=int, + help="number of blocks in Jesus layer. All configs of the form " + "--jesus-*-dim will be rounded up to be a multiple of this.", + default=100); +parser.add_argument("--jesus-stddev-scale", type=float, + help="Scaling factor on parameter stddev of Jesus layer (smaller->jesus layer learns faster)", + default=1.0) +parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components (only relevant if " + "recurrence indexes are specified). If clipping-threshold=0 no clipping is done", + default=15) +parser.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") +parser.add_argument("config_dir", + help="Directory to write config files and variables"); + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + +## Check arguments. 
+if args.splice_indexes is None: + sys.exit("--splice-indexes argument is required"); +if args.feat_dim is None or not (args.feat_dim > 0): + sys.exit("--feat-dim argument is required"); +if args.num_targets is None or not (args.num_targets > 0): + sys.exit("--num-targets argument is required"); +if args.num_jesus_blocks < 1: + sys.exit("invalid --num-jesus-blocks value"); +if args.final_hidden_dim < 0: + args.final_hidden_dim = args.jesus_forward_input_dim + +for name in [ "jesus_hidden_dim", "jesus_forward_output_dim", "jesus_forward_input_dim", + "jesus_direct_recurrence_dim", "jesus_projected_recurrence_output_dim", + "jesus_projected_recurrence_input_dim", "final_hidden_dim" ]: + old_val = getattr(args, name) + if old_val % args.num_jesus_blocks != 0: + new_val = old_val + args.num_jesus_blocks - (old_val % args.num_jesus_blocks) + printable_name = '--' + name.replace('_', '-') + print('Rounding up {0} from {1} to {2} to be a multiple of --num-jesus-blocks={3}: '.format( + printable_name, old_val, new_val, args.num_jesus_blocks)) + setattr(args, name, new_val); + + +## Work out splice_array and recurrence_array, +## e.g. for +## args.splice_indexes == '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3' +## we would have +## splice_array = [ [ -3,-2,...3 ], [-3,0] [-3,0] [-6,-3,0] +## and +## recurrence_array = [ [], [-3], [-3], [-6,-3] ] +## Note, recurrence_array[0] must be empty; and any element of recurrence_array +## may be empty. Also it cannot contain zeros, or both positive and negative elements +## at the same layer. +splice_array = [] +recurrence_array = [] +left_context = 0 +right_context = 0 +split_on_spaces = args.splice_indexes.split(" "); # we already checked the string is nonempty. 
+if len(split_on_spaces) < 2: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) +try: + for string in split_on_spaces: + this_layer = len(splice_array) + split_on_colon = string.split(":") # there will only be a colon if + # there is recurrence at this layer. + if len(split_on_colon) < 1 or len(split_on_colon) > 2 or (this_layer == 0 and len(split_on_colon) > 1): + sys.exit("invalid --splice-indexes argument: " + args.splice_indexes) + if len(split_on_colon) == 1: + split_on_colon.append("") + int_list = [] + this_splices = [ int(x) for x in split_on_colon[0].split(",") ] + this_recurrence = [ int(x) for x in split_on_colon[1].split(",") if x ] + splice_array.append(this_splices) + recurrence_array.append(this_recurrence) + if (len(this_splices) < 1): + sys.exit("invalid --splice-indexes argument [empty splices]: " + args.splice_indexes) + if len(this_recurrence) > 1 and this_recurrence[0] * this_recurrence[-1] <= 0: + sys.exit("invalid --splice-indexes argument [invalid recurrence indexes; would not be computable." 
+ + args.splice_indexes) + if not this_splices == sorted(this_splices): + sys.exit("elements of --splice-indexes must be sorted: " + + args.splice_indexes) + left_context += -this_splices[0] + right_context += this_splices[-1] +except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + " " + str(e)) +left_context = max(0, left_context) +right_context = max(0, right_context) +num_hidden_layers = len(splice_array) +input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + +f = open(args.config_dir + "/vars", "w") +print('left_context=' + str(left_context), file=f) +print('right_context=' + str(right_context), file=f) +print('num_hidden_layers=' + str(num_hidden_layers), file=f) +f.close() + + +f = open(args.config_dir + "/init.config", "w") +print('# Config file for initializing neural network prior to', file=f) +print('# preconditioning matrix computation', file=f) +print('input-node name=input dim=' + str(args.feat_dim), file=f) +list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ] +if args.ivector_dim > 0: + print('input-node name=ivector dim=' + str(args.ivector_dim), file=f) + list.append('ReplaceIndex(ivector, t, 0)') +# example of next line: +# output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))" +print('output-node name=output input=Append({0})'.format(", ".join(list)), file=f) +f.close() + + +for l in range(1, num_hidden_layers + 1): + # the following summarizes the structure of the layers: Here, the Jesus component includes ReLU at its input and output, and renormalize + # at its output after the ReLU. 
+ # layer1: splice + LDA-transform + affine + ReLU + renormalize + # layerX [non-recurrent]: splice + Jesus + affine + ReLU + # layerX [recurrent]: splice + Jesus + renormalize + split up: -> [forward] affine + ReLU + # -> [direct-recurrent] + # -> [projected-recurrent, one per delay]: affine + ReLU + # Inside the jesus component is: + # [permute +] ReLU + repeated-affine + ReLU + repeated-affine + # [we make the repeated-affine the last one so we don't have to redo that in backprop]. + # We follow this with a post-jesus composite component containing the operations: + # [permute +] ReLU + renormalize + # call this post-jesusN. + # After this we use dim-range nodes to split up the output into + # [ jesusN-forward-output, jesusN-direct-output and jesusN-projected-output ] + # parts; + # and nodes for the jesusN-forward-affine and jesusN-recurrent-affine-offsetN + # and jesusN-recurrent-affine-offsetN-clip + # computations. + + f = open(args.config_dir + "/layer{0}.config".format(l), "w") + print('# Config file for layer {0} of the network'.format(l), file=f) + if l == 1: + print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'. + format(args.config_dir), file=f) + splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] + if args.ivector_dim > 0: splices.append('ReplaceIndex(ivector, t, 0)') + orig_input='Append({0})'.format(', '.join(splices)) + # e.g. orig_input = 'Append(Offset(input, -2), ... Offset(input, 2), ivector)' + print('component-node name=lda component=lda input={0}'.format(orig_input), + file=f) + # after the initial LDA transform, put a trainable affine layer and a ReLU, followed + # by a NormalizeComponent. 
+ print('component name=affine1 type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} bias-stddev=0'.format( + input_dim, args.jesus_forward_input_dim), file=f) + print('component-node name=affine1 component=affine1 input=lda', + file=f) + # the ReLU after the affine + print('component name=relu1 type=RectifiedLinearComponent dim={1}'.format( + l, args.jesus_forward_input_dim), file=f) + print('component-node name=relu1 component=relu1 input=affine1', file=f) + # the renormalize component after the ReLU + print ('component name=renorm1 type=NormalizeComponent dim={0} '.format( + args.jesus_forward_input_dim), file=f) + print('component-node name=renorm1 component=renorm1 input=relu1', file=f) + cur_output = 'renorm1' + cur_affine_output_dim = args.jesus_forward_input_dim + else: + splices = [] + spliced_dims = [] + for offset in splice_array[l-1]: + # the connection from the previous layer + if l == 2: + splices.append('Offset(renorm1, {0})'.format(offset)) + else: + splices.append('Offset(jesus{0}-forward-output-affine, {1})'.format(l-1, offset)) + spliced_dims.append(args.jesus_forward_input_dim) + for offset in recurrence_array[l-1]: + # the direct recurrence + splices.append('IfDefined(Offset(jesus{0}-direct-output, {1}))'.format(l, offset)) + spliced_dims.append(args.jesus_direct_recurrence_dim) + # the indirect recurrence (via projection) + splices.append('IfDefined(Offset(jesus{0}-recurrent-affine-offset{1}-clip, {1}))'.format(l, offset)) + spliced_dims.append(args.jesus_projected_recurrence_input_dim) + + # get the input to the Jesus layer. 
+ cur_input = 'Append({0})'.format(', '.join(splices)) + cur_dim = sum(spliced_dims) + + this_layer_is_recurrent = (len(recurrence_array[l-1]) != 0) + this_jesus_output_dim = args.jesus_forward_output_dim + ( + (args.jesus_projected_recurrence_output_dim + + args.jesus_direct_recurrence_dim) if this_layer_is_recurrent else 0) + + # As input to the Jesus component we'll append the spliced input and + # recurrent input, and the first thing inside the component that we do + # is rearrange the dimensions so that things pertaining to a particular + # block stay together. + + column_map = [] + for x in range(0, args.num_jesus_blocks): + dim_offset = 0 + for src_splice in spliced_dims: + src_block_size = src_splice / args.num_jesus_blocks + for y in range(0, src_block_size): + column_map.append(dim_offset + (x * src_block_size) + y) + dim_offset += src_splice + if sorted(column_map) != range(0, sum(spliced_dims)): + print("column_map is " + str(column_map)) + print("num_jesus_blocks is " + str(args.num_jesus_blocks)) + print("spliced_dims is " + str(spliced_dims)) + sys.exit("code error creating new column order") + + need_input_permute_component = (column_map != range(0, sum(spliced_dims))) + + # Now add the jesus component. + num_sub_components = (5 if need_input_permute_component else 4); + print('component name=jesus{0} type=CompositeComponent num-components={1}'.format( + l, num_sub_components), file=f, end='') + # print the sub-components of the CompositeComopnent on the same line. + # this CompositeComponent has the same effect as a sequence of + # components, but saves memory. 
+ if need_input_permute_component: + print(" component1='type=PermuteComponent column-map={1}'".format( + l, ','.join([str(x) for x in column_map])), file=f, end='') + print(" component{0}='type=RectifiedLinearComponent dim={1}'".format( + (2 if need_input_permute_component else 1), + cur_dim), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + (3 if need_input_permute_component else 2), + cur_dim, args.jesus_hidden_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + (3 if need_input_permute_component else 2), + cur_dim, args.jesus_hidden_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks)), + file=f, end='') + + + print(" component{0}='type=RectifiedLinearComponent dim={1}'".format( + (4 if need_input_permute_component else 3), + args.jesus_hidden_dim), file=f, end='') + + + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + (5 if need_input_permute_component else 4), + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(args.jesus_hidden_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + (5 if need_input_permute_component else 4), + args.jesus_hidden_dim, + this_jesus_output_dim, + 
args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt((args.jesus_hidden_dim / args.num_jesus_blocks))), + file=f, end='') + + print("", file=f) # print newline. + print('component-node name=jesus{0} component=jesus{0} input={1}'.format( + l, cur_input), file=f) + + # now print the post-Jesus component which consists of [permute +] ReLU + # + renormalize. we only need the permute component if this is a + # recurrent layer. + + num_sub_components = (3 if this_layer_is_recurrent else 2); + print('component name=post-jesus{0} type=CompositeComponent num-components={1}'.format( + l, num_sub_components), file=f, end='') + if this_layer_is_recurrent: + column_map = [] + output_part_dims = [ args.jesus_forward_output_dim, + args.jesus_direct_recurrence_dim, + args.jesus_projected_recurrence_output_dim ] + if sum(output_part_dims) != this_jesus_output_dim: + sys.exit("code error") + total_block_size = this_jesus_output_dim / args.num_jesus_blocks + previous_part_dims_sum = 0 + for part_dim in output_part_dims: + within_block_offset = previous_part_dims_sum / args.num_jesus_blocks + within_block_dim = part_dim / args.num_jesus_blocks + for x in range(0, args.num_jesus_blocks): + for y in range(0, within_block_dim): + column_map.append(x * total_block_size + within_block_offset + y) + previous_part_dims_sum += part_dim + if sorted(column_map) != range(0, this_jesus_output_dim): + print("column_map is " + str(column_map)) + print("output_part_dims is " + str(output_part_dims)) + sys.exit("code error creating new column order") + print(" component1='type=PermuteComponent column-map={1}'".format( + l, ','.join([str(x) for x in column_map ])), file=f, end='') + + # still within the post-Jesus component, print the ReLU + print(" component{0}='type=RectifiedLinearComponent dim={1}'".format( + (2 if this_layer_is_recurrent else 1), + this_jesus_output_dim), file=f, end='') + # still within the post-Jesus component, print the NormalizeComponent + print(" 
component{0}='type=NormalizeComponent dim={1} '".format( + (3 if this_layer_is_recurrent else 2), + this_jesus_output_dim), file=f, end='') + print("", file=f) # print newline. + print('component-node name=post-jesus{0} component=post-jesus{0} input=jesus{0}'.format(l), + file=f) + + if len(recurrence_array[l-1]) != 0: + # This is a recurrent layer -> print the dim-range nodes. + dim_offset = 0 + print('dim-range-node name=jesus{0}-forward-output input-node=post-jesus{0} ' + 'dim={1} dim-offset={2}'.format(l, args.jesus_forward_output_dim, dim_offset), file=f) + dim_offset += args.jesus_forward_output_dim + print('dim-range-node name=jesus{0}-direct-output input-node=post-jesus{0} ' + 'dim={1} dim-offset={2}'.format(l, args.jesus_direct_recurrence_dim, dim_offset), file=f) + dim_offset += args.jesus_direct_recurrence_dim + print('dim-range-node name=jesus{0}-projected-output input-node=post-jesus{0} ' + 'dim={1} dim-offset={2}'.format(l, args.jesus_projected_recurrence_output_dim, + dim_offset), file=f) + input_to_forward_affine = 'jesus{0}-forward-output'.format(l) + else: + input_to_forward_affine = 'post-jesus{0}'.format(l) + + # handle the forward output, we need an affine node for this: + cur_affine_output_dim = (args.jesus_forward_input_dim if l < num_hidden_layers else args.final_hidden_dim) + print('component name=forward-affine{0} type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. + format(l, args.jesus_forward_output_dim, cur_affine_output_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine component=forward-affine{0} input={1}'.format( + l, input_to_forward_affine), file=f) + # for each recurrence delay, create an affine node followed by a + # clip-gradient node. [if there are multiple recurrences in the same layer, + # each one gets its own affine projection.] 
+ + # The reason we set the param-stddev to 0 is out of concern that if we + # initialize to nonzero, this will encourage the corresponding inputs at + # the jesus layer to become small (to remove this random input), which + # in turn will make this component learn slowly (due to small + # derivatives). we set the bias-mean to 0.001 so that the ReLUs on the + # input of the Jesus layer are in the part of the activation that has a + # nonzero derivative- otherwise with this setup it would never learn. + for delay in recurrence_array[l-1]: + print('component name=jesus{0}-recurrent-affine-offset{1} type=NaturalGradientAffineComponent ' + 'input-dim={2} output-dim={3} learning-rate-factor={4} param-stddev=0 bias-stddev=0 bias-mean=0.001'. + format(l, delay, + args.jesus_projected_recurrence_output_dim, + args.jesus_projected_recurrence_input_dim, + args.recurrent_projection_learning_rate_factor), file=f) + print('component-node name=jesus{0}-recurrent-affine-offset{1} component=jesus{0}-recurrent-affine-offset{1} ' + 'input=jesus{0}-projected-output'.format(l, delay), file=f) + print('component name=jesus{0}-recurrent-affine-offset{1}-clip type=ClipGradientComponent ' + 'dim={2} clipping-threshold={3} '.format(l, delay, args.jesus_projected_recurrence_input_dim, + args.clipping_threshold), file=f) + print('component-node name=jesus{0}-recurrent-affine-offset{1}-clip component=jesus{0}-recurrent-affine-offset{1}-clip ' + 'input=jesus{0}-recurrent-affine-offset{1}'.format(l, delay), file=f) + + cur_output = 'jesus{0}-forward-output-affine'.format(l) + + + # with each new layer we regenerate the final-affine component, with a ReLU before it + # because the layers we printed don't end with a nonlinearity. 
+ print('component name=final-relu type=RectifiedLinearComponent dim={0}'.format( + cur_affine_output_dim), file=f) + print('component-node name=final-relu component=final-relu input={0}'.format(cur_output), + file=f) + print('component name=final-affine type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} learning-rate-factor={2} param-stddev=0.0 bias-stddev=0'.format( + cur_affine_output_dim, args.num_targets, + args.final_layer_learning_rate_factor), file=f) + print('component-node name=final-affine component=final-affine input=final-relu', + file=f) + # printing out the next two, and their component-nodes, for l > 1 is not + # really necessary as they will already exist, but it doesn't hurt and makes + # the structure clearer. + if args.include_log_softmax == "true": + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) + + if args.xent_regularize != 0.0: + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ print('component name=final-affine-xent type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format( + cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f) + print('component-node name=final-affine-xent component=final-affine-xent input=final-relu', + file=f) + print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax-xent component=final-log-softmax-xent ' + 'input=final-affine-xent', file=f) + print('output-node name=output-xent input=final-log-softmax-xent', file=f) + + f.close() diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index 24666b8bd02..c36de8c16bf 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -1,11 +1,12 @@ #!/bin/bash # script showing use of nnet3_to_dot.py -# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). +# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). # Begin configuration section. component_attributes="name,type" node_prefixes="" +info_bin=nnet3-am-info echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. @@ -20,7 +21,7 @@ if [ $# != 3 ]; then echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" - + exit 1; fi @@ -29,10 +30,10 @@ dot_file=$2 output_file=$3 attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} -nnet3-am-info $model | \ +$info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ - $attr > $dot_file + $attr $dot_file command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. 
Please compile $dot_file with dot"; exit 1; } -dot -Tpng $dot_file -o $output_file +dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/report/README b/egs/wsj/s5/steps/nnet3/report/README new file mode 100644 index 00000000000..848b4e32fad --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/README @@ -0,0 +1,6 @@ +These python scripts are used to parse the log files generated by nnet3 scripts. + +Usage: + steps/nnet3/report/nnet3_log_parse.py --key log-probability exp/chain/tdnn_4q + + steps/nnet3/report/nnet3_log_parse.py --key accuracy exp/nnet3/tdnn_sp diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse.py new file mode 100755 index 00000000000..225906eea1b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +# script to parse the train logs generated by nnet-compute-prob +from __future__ import division +import sys, glob, re, numpy, math, datetime, argparse +from subprocess import Popen, PIPE + +def parse_train_logs(exp_dir): + train_log_files = "%s/log/train.*.log" % (exp_dir) + train_log_proc = Popen('grep -e Accounting {0}'.format(train_log_files), + shell=True, + stdout=PIPE, + stderr=PIPE) + train_log_lines = train_log_proc.communicate()[0] + parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*") + train_times = {} + for line in train_log_lines.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + try: + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + except KeyError: + train_times[int(groups[0])] = {} + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + iters = train_times.keys() + for iter in iters: + values = train_times[iter].values() + train_times[iter] = max(values) + return train_times + +def parse_prob_logs(exp_dir, key = 'accuracy'): + train_prob_files = "%s/log/compute_prob_train.*.log" % 
(exp_dir) + valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) + train_prob_proc = Popen('grep -e {0} {1}'.format(key, train_prob_files), + shell=True, + stdout=PIPE, + stderr=PIPE) + train_prob_strings = train_prob_proc.communicate()[0] + valid_prob_proc = Popen('grep -e {0} {1}'.format(key, valid_prob_files), + shell=True, + stdout=PIPE, + stderr=PIPE) + valid_prob_strings = valid_prob_proc.communicate()[0] + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames. + parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-]+) .*per frame") + train_loss={} + valid_loss={} + + + for line in train_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + train_loss[int(groups[0])] = groups[2] + + for line in valid_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + valid_loss[int(groups[0])] = groups[2] + iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) + iters.sort() + return numpy.array(map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Prints accuracy/log-probability across iterations") + parser.add_argument("--key", type=str, default="accuracy", + help="Value to print out") + parser.add_argument("exp_dir", help="experiment directory, e.g. 
exp/nnet3/tdnn") + + args = parser.parse_args() + exp_dir = args.exp_dir + times = parse_train_logs(exp_dir) + data = parse_prob_logs(exp_dir, key = args.key) + print "%Iter\tduration\ttrain_loss\tvalid_loss\tdifference" + for x in data: + try: + print "%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1]) + except KeyError: + continue + + total_time = 0 + for iter in times.keys(): + total_time += times[iter] + print "Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time))) diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_progress_log_parse.py b/egs/wsj/s5/steps/nnet3/report/nnet3_progress_log_parse.py new file mode 100755 index 00000000000..a910d42d6b1 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_progress_log_parse.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# script to parse the train logs generated by nnet-compute-prob +from __future__ import division +import sys, glob, re, numpy, math, datetime, argparse +from subprocess import Popen, PIPE + +#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] + +def parse_difference_string(string): + dict = {} + for parts in string.split(): + sub_parts = parts.split(":") + dict[sub_parts[0]] = float(sub_parts[1]) + return dict + +def parse_progress_logs(exp_dir, pattern): + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + progress_per_iter = {} + component_names = set([]) + progress_log_proc = 
Popen('grep -e "{0}" {1}'.format(pattern, progress_log_files),
+                              shell=True,
+                              stdout=PIPE,
+                              stderr=PIPE)
+    progress_log_lines = progress_log_proc.communicate()[0]  # stdout of the grep; its stderr is captured but not used
+    parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern))  # group 1: iteration number, group 2: text between [ and ]
+    for line in progress_log_lines.split("\n") :
+        mat_obj = parse_regex.search(line)
+        if mat_obj is None:
+            continue
+        groups = mat_obj.groups()
+        iteration = groups[0]
+        differences = parse_difference_string(groups[1])
+        component_names = component_names.union(differences.keys())
+        progress_per_iter[int(iteration)] = differences
+
+    component_names = list(component_names)
+    component_names.sort()
+    # rearranging the data into rows: a header row, then one row per iteration
+    data = []
+    data.append(["iteration"]+component_names)
+    # Walk over the iterations that were actually found, in increasing order.
+    # (Taking max() of the keys, as was done before, raises ValueError when no
+    # progress-log line matched at all; in that case only the header row is
+    # returned.)
+    for iter in sorted(progress_per_iter.keys()):
+        component_dict = progress_per_iter[iter]
+        iter_values = []
+        for component_name in component_names:
+            try:
+                iter_values.append(component_dict[component_name])
+            except KeyError:
+                # the component was not found this iteration, may be because of layerwise discriminative training
+                iter_values.append(0)
+        data.append([iter] + iter_values)
+
+    return data
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Prints accuracy/log-probability across iterations")
+    parser.add_argument("--key", type=str, default="relative-difference",
+                        help="Value to print out", choices = ["relative-difference", 'difference'])
+    parser.add_argument("exp_dir", help="experiment directory, e.g. 
exp/nnet3/tdnn") + + args = parser.parse_args() + exp_dir = args.exp_dir + if args.key == "relative-difference": + key = "Relative parameter differences" + else: + key = "Parameter differences" + data = parse_progress_logs(exp_dir, key) + for row in data: + print " ".join(map(lambda x:str(x),row)) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..5c042c3a15e --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +import imp +import ast +import scipy.signal as signal +import numpy as np + +nodes = imp.load_source('', 'steps/nnet3/components.py') + + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) 
+ + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats, num_feats) + + return [output_descriptor, filter_context, filter_context] + + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + # low-pass smoothing of input was specified. so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = np.append(lp_filter, 0) + nodes.WriteKaldiMatrix(np.array([lp_filter]), lpfilt_filename) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + + + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseSpliceString(splice_indexes, label_delay=None): 
+    ## Work out splice_array (one sorted list of offsets per space-separated field), e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ]
+    splice_array = []
+    left_context = 0
+    right_context = 0
+    split1 = splice_indexes.split(" ")  # use the argument, not the global 'args' (which only exists when run as a script)
+    if len(split1) < 1:
+        sys.exit("invalid --splice-indexes argument, too short: "
+                 + splice_indexes)
+    try:
+        for string in split1:
+            split2 = string.split(",")
+            if len(split2) < 1:
+                sys.exit("invalid --splice-indexes argument, too-short element: "
+                         + splice_indexes)
+            int_list = []
+            for int_str in split2:
+                int_list.append(int(int_str))
+            if not int_list == sorted(int_list):
+                sys.exit("elements of --splice-indexes must be sorted: "
+                         + splice_indexes)
+            left_context += -int_list[0]  # contexts add up over layers: minus the first offset on the left ...
+            right_context += int_list[-1]  # ... and the last offset on the right
+            splice_array.append(int_list)
+    except ValueError as e:
+        sys.exit("invalid --splice-indexes argument " + splice_indexes + ": " + str(e))  # str(e): str + exception raises TypeError
+    left_context = max(0, left_context)
+    right_context = max(0, right_context)
+    num_hidden_layers = len(splice_array)
+    # (no input_dim is computed here: it was never used, and it needed the global 'args')
+
+    return {'left_context':left_context,
+            'right_context':right_context,
+            'splice_indexes':splice_array,
+            'num_hidden_layers':num_hidden_layers
+            }
+
+if __name__ == "__main__":
+    # we add compulsory arguments as named arguments for readability
+    parser = argparse.ArgumentParser(description="Writes config files and variables "
+                                                 "for TDNNs creation and training",
+                                     epilog="See steps/nnet3/tdnn/train.sh for example.")
+    # General neural network options
+    parser.add_argument("--splice-indexes", type=str,
+                        help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3' [compulsary argument]", default="0")
+    parser.add_argument("--feat-dim", type=int,
+                        help="Raw feature dimension, e.g. 13")
+    parser.add_argument("--ivector-dim", type=int,
+                        help="iVector dimension, e.g. 
100", default=0) + parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'sum', 'max', 'weighted-average', 'per-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = "true") + parser.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.splice_indexes is None: + sys.exit("--splice-indexes argument is required") + if args.feat_dim is None or not (args.feat_dim > 0): + sys.exit("--feat-dim argument is required") + if args.num_targets is None or not (args.num_targets > 0): + sys.exit("--num-targets argument is required") + if (args.subset_dim < 0): + sys.exit("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + sys.exit("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + sys.exit("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + nonlin_input_dim = args.relu_dim + nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + sys.exit("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + nonlin_input_dim = args.pnorm_input_dim + nonlin_output_dim = args.pnorm_output_dim + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(args.config_dir) + if args.use_presoftmax_prior_scale == "true": + use_presoftmax_prior_scale = True + else: + use_presoftmax_prior_scale = False + + parsed_splice_output = ParseSpliceString(args.splice_indexes.strip()) + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, 
prev_layer_output) + config_files[args.config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. + # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if args.pool_type != "none" and args.pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(args.pool_type)) + if args.pool_type in set(["low-pass", "weighted-average"]): + if args.pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + args.pool_lpfilter_width, + args.pool_window, + args.config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + if args.pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = AddPerDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + args.pool_window) + + left_context += cur_left_context + 
right_context += cur_right_context + + if args.pool_type == "sum": + raise NotImplementedError("Sum-pooling has not been tested yet.") + + if args.pool_type == "max" : + raise NotImplementedError("Max-pooling component needs to be reimplemented for this.") + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if args.subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, args.subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : args.subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, norm_target_rms = 1.0 if i < num_hidden_layers -1 else args.final_layer_normalize_target) + # a final layer is added after each new layer as we are generating configs for layer-wise discriminative training + nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, + 
use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True if args.include_log_softmax == "true" else False) + + config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('left_context=' + str(left_context), file=f) + print('right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh new file mode 100755 index 00000000000..773e10ccab6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -0,0 +1,660 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. 
+max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +# count space-separated fields in splice_indexes to get num-hidden-layers. 
+splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. +align_cmd= # The cmd that is passed to steps/nnet2/align.sh +align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] +realign_times= # List of times on which we realign. Each time is + # floating point number strictly between 0 and 1, which + # will be multiplied by the num-iters to get an iteration + # number. +num_jobs_align=30 # Number of jobs for realignment +# End configuration section. +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." 
+ echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." 
+ echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" + echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" + echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" + echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +if [ ! -z "$realign_times" ]; then + [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1 + [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1 +fi + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 +[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1 +[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1 + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! 
-f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! 
-z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" $egs_opts \ + --frames-per-eg $frames_per_eg \ + $data $alidir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. 
+if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... 
using priors^$presoftmax_prior_scale_power and rescaling to average 1" + + # obtains raw pdf count + $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; + $cmd $dir/log/sum_pdf_counts.log \ + vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; + rm $dir/pdf_counts.* + + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ + '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } + num_pdfs=NF-2; average_count = total/num_pdfs; + for (i=0; i $dir/presoftmax_prior_scale.vec + ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec +fi + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat and + # presoftmax_prior_scale.vec. + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + + # Convert to .mdl, train the transitions, set the priors. + $cmd $dir/log/init_mdl.log \ + nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1; +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +! 
[ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + +for realign_time in $realign_times; do + # Work out the iterations on which we will re-align, if the --realign-times + # option was used. 
This is slightly approximate. + ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \ + echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1."; + # the next formula is based on the one for mix_up_iter above. + realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1; + realign_this_iter[$realign_iter]=$realign_time +done + +cur_egs_dir=$egs_dir + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ ! -z "${realign_this_iter[$x]}" ]; then + prev_egs_dir=$cur_egs_dir + cur_egs_dir=$dir/egs_${realign_this_iter[$x]} + fi + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + if [ ! -z "${realign_this_iter[$x]}" ]; then + time=${realign_this_iter[$x]} + + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
+ rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.$x.log \ + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + + sleep 2 + + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ + --iter $x $data $lang $dir $dir/ali_$time || exit 1 + + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + $prev_egs_dir $cur_egs_dir || exit 1 + + if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then + steps/nnet3/remove_egs.sh $prev_egs_dir + fi + fi + + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. 
+ this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. 
+ n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) 
+ + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & +fi + +if [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.final.log \ + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; +fi + + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $cur_egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.mdl + fi + done +fi diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh new file mode 100755 index 00000000000..de8b0519009 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2014-2015 Vimal Manohar +# Apache 2.0. + +set -e +set -o pipefail + +# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training +# using egs obtained by steps/nnet3/get_egs_discriminative.sh + +# Begin configuration section. +cmd=run.pl +num_epochs=4 # Number of epochs of training; + # the number of iterations is worked out from this. + # Be careful with this: we actually go over the data + # num-epochs * frame-subsampling-factor times, due to + # using different data-shifts. +use_gpu=true +truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames + # near the edges. (counts subsampled frames). 
+apply_deriv_weights=true +run_diagnostics=true +learning_rate=0.00002 +max_param_change=2.0 +scale_max_param_change=false # if this option is used, scale it by num-jobs. + +effective_lrate= # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet. +acoustic_scale=0.1 # acoustic scale for MMI/MPFE/SMBR training. +boost=0.0 # option relevant for MMI + +criterion=smbr +drop_frames=false # option relevant for MMI +one_silence_class=true # option relevant for MPE/SMBR +num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this + # will interact with the learning rates (if you decrease + # this, you'll have to decrease the learning rate, and vice + # versa). +regularization_opts= +minibatch_size=64 # This is the number of examples rather than the number of output frames. +modify_learning_rates=false +last_layer_factor=1.0 # relates to modify-learning-rates +first_layer_factor=1.0 # relates to modify-learning-rates +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + + +stage=-3 + +adjust_priors=true +num_threads=16 # this is the default but you may want to change it, e.g. to 1 if + # using GPUs. + +cleanup=true +keep_model_iters=1 +retroactive=false +remove_egs=false +src_model= # will default to $degs_dir/final.mdl + +left_deriv_truncate= # number of time-steps to avoid using the deriv of, on the left. +right_deriv_truncate= # number of time-steps to avoid using the deriv of, on the right. +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe" + echo "" + echo "You have to first call get_egs_discriminative2.sh to dump the egs." + echo "Caution: the options 'drop-frames' and 'criterion' are taken here" + echo "even though they were required also by get_egs_discriminative2.sh," + echo "and they should normally match." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|4> # Number of epochs of training" + echo " --learning-rate # Learning rate to use" + echo " --effective-lrate # If supplied, learning rate will be set to" + echo " # this value times num-jobs-nnet." + echo " --num-jobs-nnet # Number of parallel jobs to use for main neural net" + echo " # training (will affect results as well as speed; try 8, 16)" + echo " # Note: if you increase this, you may want to also increase" + echo " # the learning rate. Also note: if there are fewer archives" + echo " # of egs than this, it will get reduced automatically." + echo " --num-threads # Number of parallel threads per job (will affect results" + echo " # as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size. With GPU, must be 1." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... " + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + echo " --criterion # Training criterion: may be smbr, mmi or mpfe" + echo " --boost # Boosting factor for MMI (e.g., 0.1)" + echo " --drop-frames # Option that affects MMI training: if true, we exclude gradients from frames" + echo " # where the numerator transition-id is not in the denominator lattice." 
+ echo " --one-silence-class # Option that affects MPE/SMBR training (will tend to reduce insertions)" + echo " --modify-learning-rates # If true, modify learning rates to try to equalize relative" + echo " # changes across layers." + exit 1; +fi + +degs_dir=$1 +dir=$2 + +[ -z "$src_model" ] && src_model=$degs_dir/final.mdl + +# Check some files. +for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_eg,egs_per_archive} $src_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log || exit 1; + +# copy some things +for f in splice_opts cmvn_opts tree final.mat; do + if [ -f $degs_dir/$f ]; then + cp $degs_dir/$f $dir/ || exit 1; + fi +done + +silphonelist=`cat $degs_dir/info/silence.csl` || exit 1; + +num_archives_priors=0 +if $adjust_priors; then + num_archives_priors=`cat $degs_dir/info/num_archives_priors` || exit 1 +fi + +frames_per_eg=$(cat $degs_dir/info/frames_per_eg) || { echo "error: no such file $degs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $degs_dir/info/num_archives) || exit 1; +frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor) + +echo $frame_subsampling_factor > $dir/frame_subsampling_factor + +num_archives_expanded=$[$num_archives*$frame_subsampling_factor] + +if [ $num_jobs_nnet -gt $num_archives_expanded ]; then + echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives_expanded," + echo " ... setting it to $num_archives." + num_jobs_nnet=$num_archives_expanded +fi + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[$num_archives_to_process/$num_jobs_nnet] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +prior_gpu_opt="--use-gpu=no" +prior_queue_opt="" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! 
cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + +for e in $(seq 1 $[num_epochs*frame_subsampling_factor]); do + x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number. + iter_to_epoch[$x]=$e +done + +if [ $stage -le -1 ]; then + echo "$0: Copying initial model and modifying preconditioning setup" + + # Note, the baseline model probably had preconditioning, and we'll keep it; + # but we want online preconditioning with a larger number of samples of + # history, since in this setup the frames are only randomized at the segment + # level so they are highly correlated. It might make sense to tune this a + # little, later on, although I doubt it matters once the --num-samples-history + # is large enough. + + if [ ! -z "$effective_lrate" ]; then + learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);") + echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate." + fi + + $cmd $dir/log/convert.log \ + nnet3-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1; +fi + + +rm -f $dir/.error 2>/dev/null || true + +x=0 + +deriv_time_opts= +[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate" +[ ! -z "$right_deriv_truncate" ] && \ + deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))" + +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + if $run_diagnostics; then + # Set off jobs doing some diagnostics, in the background. 
# Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_objf_valid.$x.log \ + nnet3-discriminative-compute-objf $regularization_opts \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale \ + $dir/$x.mdl \ + ark:$degs_dir/valid_diagnostic.degs & + $cmd $dir/log/compute_objf_train.$x.log \ + nnet3-discriminative-compute-objf $regularization_opts \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale \ + $dir/$x.mdl \ + ark:$degs_dir/train_diagnostic.degs & + fi + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + + echo "Training neural net (pass $x)" + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in `seq $num_jobs_nnet`; do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
+ + if [ $[num_archives % frame_subsampling_factor] -ne 0 ]; then + frame_shift=$[k % frame_subsampling_factor] + else + frame_shift=$[(k + k/num_archives) % frame_subsampling_factor] + fi + + #archive=$[(($n+($x*$num_jobs_nnet))%$num_archives)+1] + if $scale_max_param_change; then + this_max_param_change=$(perl -e "print ($max_param_change * $num_jobs_nnet);") + else + this_max_param_change=$max_param_change + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-discriminative-train --verbose=2 \ + --apply-deriv-weights=$apply_deriv_weights \ + $parallel_train_opts $deriv_time_opts \ + --max-param-change=$this_max_param_change \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \ + $dir/$x.mdl \ + "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift --truncate-deriv-weights=$truncate_deriv_weights ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + [ -f $dir/.error ] && { echo "Found $dir/.error. See $dir/log/train.$x.*.log"; exit 1; } + + nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.raw; done) + + # below use run.pl instead of a generic $cmd for these very quick stages, + # so that we don't run the risk of waiting for a possibly hard-to-get GPU. 
+ run.pl $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + + if $modify_learning_rates; then + run.pl $dir/log/modify_learning_rates.$x.log \ + nnet3-modify-learning-rates --retroactive=$retroactive \ + --last-layer-factor=$last_layer_factor \ + --first-layer-factor=$first_layer_factor \ + "nnet3-am-copy --raw $dir/$x.mdl -|" "nnet3-am-copy --raw $dir/$[$x+1].mdl -|" - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + rm $nnets_list + + if [ ! -z "${iter_to_epoch[$x]}" ]; then + e=${iter_to_epoch[$x]} + ln -sf $x.mdl $dir/epoch$e.mdl + fi + + if $adjust_priors && [ ! -z "${iter_to_epoch[$x]}" ]; then + if [ ! -f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." + echo "$0: Run this script with --adjust-priors false to not adjust priors" + exit 1 + fi + ( + e=${iter_to_epoch[$x]} + rm -f $dir/.error 2> /dev/null || true + + steps/nnet3/adjust_priors.sh --egs-type priors_egs \ + --num-jobs-compute-prior $num_archives_priors \ + --cmd "$cmd $prior_queue_opt" --use-gpu false \ + --raw false --iter epoch$e $dir $degs_dir \ + || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; } + ) & + fi + + [ -f $dir/.error ] && { echo "Found $dir/.error. Error on iteration $x"; exit 1; } + fi + + x=$[$x+1] + num_archives_processed=$[num_archives_processed+num_jobs_nnet] +done + +rm -f $dir/final.mdl 2>/dev/null || true +cp $dir/$x.mdl $dir/final.mdl +ln -sf final.mdl $dir/epoch$[num_epochs*frame_subsampling_factor].mdl + +if $adjust_priors && [ $stage -le $num_iters ]; then + if [ ! -f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." 
+ echo "$0: Run this script with --adjust-priors false to not adjust priors" + exit 1 + fi + + steps/nnet3/adjust_priors.sh --egs-type priors_egs \ + --num-jobs-compute-prior $num_archives_priors \ + --cmd "$cmd $prior_queue_opt" --use-gpu false \ + --raw false --iter epoch$[num_epochs*frame_subsampling_factor] \ + $dir $degs_dir || exit 1 +fi + +echo Done + +#epoch_final_iters= +#for e in $(seq 0 $num_epochs); do +# x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number. +# #ln -sf $x.mdl $dir/epoch$e.mdl +# epoch_final_iters="$epoch_final_iters $x" +#done + + +# function to remove egs that might be soft links. +remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done } + +if $cleanup && $remove_egs; then # note: this is false by default. + echo Removing training examples + remove $degs_dir/degs.* + remove $degs_dir/priors_egs.* +fi + + +if $cleanup; then + echo Removing most of the models + for x in `seq 1 $keep_model_iters $num_iters`; do + if [ -z "${iter_to_epoch[$x]}" ]; then + # if $x is not an epoch-final iteration.. 
+ rm $dir/$x.mdl 2>/dev/null + fi + done +fi + diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index e17026e496f..d8ac11da720 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -93,6 +93,7 @@ echo -n >$ieconf cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1; echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf +echo "--ivector-period=$ivector_period" >>$ieconf echo "--splice-config=$dir/conf/splice.conf" >>$ieconf echo "--lda-matrix=$srcdir/final.mat" >>$ieconf echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf diff --git a/egs/wsj/s5/steps/score_kaldi.sh b/egs/wsj/s5/steps/score_kaldi.sh index 8a2aee9d48d..202208c1f5f 100755 --- a/egs/wsj/s5/steps/score_kaldi.sh +++ b/egs/wsj/s5/steps/score_kaldi.sh @@ -14,6 +14,7 @@ beam=6 word_ins_penalty=0.0,0.5,1.0 min_lmwt=9 max_lmwt=20 +iter=final #end configuration section. 
echo "$0 $@" # Print the command line for logging diff --git a/src/Makefile b/src/Makefile index 57a4b98e0aa..c8d2e401866 100644 --- a/src/Makefile +++ b/src/Makefile @@ -170,7 +170,7 @@ cudamatrix: base util matrix nnet: base util matrix cudamatrix nnet2: base util matrix thread lat gmm hmm tree transform cudamatrix nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix chain -chain: lat hmm tree fstext matrix cudamatrix util base +chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix thread transform tree gmm #3)Dependencies for optional parts of Kaldi onlinebin: base matrix util feat tree optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread diff --git a/src/base/Makefile b/src/base/Makefile index 8db3b86d021..88be1b96c9a 100644 --- a/src/base/Makefile +++ b/src/base/Makefile @@ -5,7 +5,7 @@ include ../kaldi.mk TESTFILES = kaldi-math-test io-funcs-test kaldi-error-test timer-test -OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o +OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o kaldi-types-extra.o LIBNAME = kaldi-base diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index 9629c5466ad..9311645cc0c 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -3,6 +3,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian; // Johns Hopkins University (Author: Daniel Povey) +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -87,6 +88,112 @@ template inline void ReadBasicType(std::istream &is, } } +// Template that covers integers. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. 
+ os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. + os << "[ "; + typename std::vector >::const_iterator iter = v.begin(), + end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(iter->first) << ',' + << static_cast(iter->second) << ' '; + else + os << iter->first << ',' + << iter->second << ' '; + } + os << "]\n"; + } + if (os.fail()) { + throw std::runtime_error("Write failure in WriteIntegerPairVector."); + } +} + +// Template that covers integers. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz*2); + } + } else { + std::vector > tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. + is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. 
+ int16 next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } else { + T next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. + } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerPairVector: read failure at file position " + << is.tellg(); +} template inline void WriteIntegerVector(std::ostream &os, bool binary, const std::vector &v) { diff --git a/src/base/io-funcs-test.cc b/src/base/io-funcs-test.cc index 63506073ff8..dd05326d5ed 100644 --- a/src/base/io-funcs-test.cc +++ b/src/base/io-funcs-test.cc @@ -43,8 +43,20 @@ void UnitTestIo(bool binary) { WriteIntegerVector(outfile, binary, vec2); if (!binary) outfile << " \n"; std::vector vec3; - for (size_t i = 0; i < 10; i++) vec3.push_back(Rand()%100); + + int32 size = RandInt(0, 10); + for (size_t i = 0; i < size; i++) vec3.push_back(Rand()%100); WriteIntegerVector(outfile, binary, vec3); + std::vector > vec4; + WriteIntegerPairVector(outfile, binary, vec4); + if (!binary && Rand()%2 == 0) outfile << " \n"; + std::vector > vec5; + for (size_t i = 0; i < size; i++) vec5.push_back(std::make_pair(Rand()%100 - 10, Rand()%100 - 10)); + WriteIntegerPairVector(outfile, binary, vec5); + if (!binary) outfile << " \n"; + std::vector > 
vec6; + for (size_t i = 0; i < size; i++) vec6.push_back(std::make_pair(Rand()%100, Rand()%100)); + WriteIntegerPairVector(outfile, binary, vec6); if (!binary && Rand()%2 == 0) outfile << " \n"; const char *token1 = "Hi"; WriteToken(outfile, binary, token1); @@ -90,6 +102,15 @@ void UnitTestIo(bool binary) { std::vector vec3_in; ReadIntegerVector(infile, binary_in, &vec3_in); KALDI_ASSERT(vec3_in == vec3); + std::vector > vec4_in; + ReadIntegerPairVector(infile, binary_in, &vec4_in); + KALDI_ASSERT(vec4_in == vec4); + std::vector > vec5_in; + ReadIntegerPairVector(infile, binary_in, &vec5_in); + KALDI_ASSERT(vec5_in == vec5); + std::vector > vec6_in; + ReadIntegerPairVector(infile, binary_in, &vec6_in); + KALDI_ASSERT(vec6_in == vec6); std::string token1_in, token2_in; KALDI_ASSERT(Peek(infile, binary_in) == static_cast(*token1)); KALDI_ASSERT(PeekToken(infile, binary_in) == static_cast(*token1)); diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index ba0cf1c1c7c..4caddc6b5b3 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -181,6 +182,16 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, template inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); +/// Function for writing STL vectors of pairs of integer types. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v); + +/// Function for reading STL vector of pairs of integer types. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v); + /// The WriteToken functions are for writing nonempty sequences of non-space /// characters. They are not for general strings. 
void WriteToken(std::ostream &os, bool binary, const char *token); diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index e28ddcc1a09..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) diff --git a/src/base/kaldi-types-extra.cc b/src/base/kaldi-types-extra.cc new file mode 100644 index 00000000000..f7f67a19fb4 --- /dev/null +++ b/src/base/kaldi-types-extra.cc @@ -0,0 +1,268 @@ +// base/kaldi-types-extra.cc + +// Copyright 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-math.h" +#include "base/kaldi-types-extra.h" +#include "base/kaldi-types.h" + +namespace kaldi { + +template +void SignedLogReal::SetZero() { + sign_ = false; + log_f_ = kLogZeroDouble; +} + +template +void SignedLogReal::SetOne() { + sign_ = false; + log_f_ = 0.0; +} + +template +void SignedLogReal::Set(Real f) { + if (f < 0.0) { + sign_ = true; + log_f_ = static_cast(kaldi::Log(static_cast(-f))); + } else { + sign_ = false; + log_f_ = static_cast(kaldi::Log(static_cast(f))); + } +} + +template +void SignedLogReal::SetRandn() { + Set(kaldi::RandGauss()); +} + +template +void SignedLogReal::SetRandUniform() { + Set(kaldi::RandUniform()); +} + +template +void SignedLogReal::Log() { + KALDI_ASSERT(Positive()); + log_f_ = kaldi::Log(log_f_); +} + +template +bool SignedLogReal::IsZero(Real cutoff) const { + return (log_f_ < kaldi::Log(cutoff)); +} + +template +bool SignedLogReal::IsOne(Real cutoff) const { + return ( Positive() && (log_f_ > 0 ? LogSub(log_f_, 0) : LogSub(0, log_f_)) < kaldi::Log(cutoff) ); +} + +template +bool SignedLogReal::ApproxEqual(const SignedLogReal &other, float tol) const { + + if (Sign() == other.sign_) { + double tmp1 = log_f_; + double tmp2 = other.LogMagnitude(); + if (tmp1 >= tmp2) { + return (LogSub(tmp1, tmp2) <= kaldi::Log(tol) + tmp1); + } else { + return (LogSub(tmp2, tmp1) <= kaldi::Log(tol) + tmp1); + } + } + + return (LogAdd(log_f_, other.LogMagnitude() <= kaldi::Log(tol) + log_f_)); +} + +template +bool SignedLogReal::Equal(const SignedLogReal &other) const { + return (sign_ == other.sign_ && log_f_ == other.log_f_); +} + +template +template +void SignedLogReal::Add(const SignedLogReal &a) { + if (sign_ == a.Sign()) { + log_f_ = LogAdd(log_f_, a.LogMagnitude()); + } else { + if (log_f_ < a.LogMagnitude()) { + sign_ = !sign_; + log_f_ = LogSub(a.LogMagnitude(), log_f_); + } else { + log_f_ = LogSub(log_f_, a.LogMagnitude()); + } + } +} + +template +template +void SignedLogReal::AddReal(OtherReal f) { + 
SignedLogReal temp(f); + Add(temp); +} + +template +template +void SignedLogReal::AddLogReal(OtherReal log_f) { + SignedLogReal temp(false, log_f); + Add(temp); +} + +template +template +void SignedLogReal::AddMultiplyLogReal(const SignedLogReal &a, + OtherReal log_b) { + SignedLogReal temp(false, log_b); + temp.Multiply(a); + Add(temp); +} + +template +template +void SignedLogReal::Sub(const SignedLogReal &a) { + if (sign_ == a.Sign()) { + if (log_f_ < a.LogMagnitude()) { + sign_ = !sign_; + log_f_ = LogSub(a.LogMagnitude(), log_f_); + } else { + log_f_ = LogSub(log_f_, a.LogMagnitude()); + } + } else { + log_f_ = LogAdd(log_f_, a.LogMagnitude()); + } +} + +template +template +void SignedLogReal::SubMultiplyLogReal(const SignedLogReal &a, + OtherReal log_b) { + SignedLogReal temp(false, log_b); + temp.Multiply(a); + Sub(temp); +} + +template +template +void SignedLogReal::Multiply(const SignedLogReal &a) { + if (sign_ != a.Sign()) { sign_ = true; } + else { sign_ = false; } + + log_f_ += a.LogMagnitude(); +} + +template +template +void SignedLogReal::MultiplyReal(OtherReal f) { + SignedLogReal temp(f); + Multiply(temp); +} + +template +template +void SignedLogReal::MultiplyLogReal(OtherReal log_f) { + log_f_ += log_f; +} + +template +template +void SignedLogReal::DivideBy(const SignedLogReal &a) { + if (sign_ != a.Sign()) { sign_ = true; } + else { sign_ = false; } + + log_f_ -= a.LogMagnitude(); +} + +template +SignedLogReal SignedLogReal::operator+(const SignedLogReal &a) const { + SignedLogReal tmp(*this); + tmp.Add(a); + return tmp; +} + +template +SignedLogReal SignedLogReal::operator*(const SignedLogReal &a) const { + SignedLogReal tmp(*this); + tmp.Multiply(a); + return tmp; +} + +template +SignedLogReal SignedLogReal::operator/(const SignedLogReal &a) const { + SignedLogReal tmp(*this); + tmp.DivideBy(a); + return tmp; +} + +template +SignedLogReal operator-(const SignedLogReal &a) { + SignedLogReal tmp(a); + tmp.Negate(); + return tmp; +} + +template 
+SignedLogReal SignedLogReal::operator-(const SignedLogReal &a) const { + SignedLogReal tmp(*this); + tmp.Sub(a); + return tmp; +} + +template void SignedLogReal::Add(const SignedLogReal &a); +template void SignedLogReal::Add(const SignedLogReal &); +template void SignedLogReal::AddReal(double f); +template void SignedLogReal::AddReal(float f); +template void SignedLogReal::AddLogReal(double f); +template void SignedLogReal::AddLogReal(float f); +template void SignedLogReal::AddMultiplyLogReal(const SignedLogReal &a, double log_b); +template void SignedLogReal::AddMultiplyLogReal(const SignedLogReal &a, float log_b); +template void SignedLogReal::Sub(const SignedLogReal &a); +template void SignedLogReal::Sub(const SignedLogReal &); +template void SignedLogReal::SubMultiplyLogReal(const SignedLogReal &a, double log_b); +template void SignedLogReal::SubMultiplyLogReal(const SignedLogReal &a, float log_b); +template void SignedLogReal::Multiply(const SignedLogReal &a); +template void SignedLogReal::Multiply(const SignedLogReal &a); +template void SignedLogReal::MultiplyReal(double f); +template void SignedLogReal::MultiplyReal(float f); +template void SignedLogReal::MultiplyLogReal(double f); +template void SignedLogReal::MultiplyLogReal(float f); +template void SignedLogReal::DivideBy(const SignedLogReal &a); +template void SignedLogReal::DivideBy(const SignedLogReal &a); + +template SignedLogReal SignedLogReal::operator+(const SignedLogReal &a) const; +template SignedLogReal SignedLogReal::operator*(const SignedLogReal &a) const ; +template SignedLogReal SignedLogReal::operator/(const SignedLogReal &a) const; +template SignedLogReal SignedLogReal::operator-(const SignedLogReal &a) const; +template SignedLogReal operator-(const SignedLogReal &a); + +template SignedLogReal::SignedLogReal(double f); +template SignedLogReal::SignedLogReal(float f); +template SignedLogReal::SignedLogReal(double f); +template SignedLogReal::SignedLogReal(float f); + +template 
SignedLogReal::SignedLogReal(bool s, double); +template SignedLogReal::SignedLogReal(bool s, float); +template SignedLogReal::SignedLogReal(bool s, double); +template SignedLogReal::SignedLogReal(bool s, float); + +template SignedLogReal::SignedLogReal(const SignedLogReal &); +template SignedLogReal::SignedLogReal(const SignedLogReal &); +template SignedLogReal::SignedLogReal(const SignedLogReal &); +template SignedLogReal::SignedLogReal(const SignedLogReal &); + +template class SignedLogReal; + +} // namespace kaldi diff --git a/src/base/kaldi-types-extra.h b/src/base/kaldi-types-extra.h new file mode 100644 index 00000000000..6d1a4656a3e --- /dev/null +++ b/src/base/kaldi-types-extra.h @@ -0,0 +1,171 @@ +// base/kaldi-types-extra.h + +// Copyright 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_BASE_KALDI_TYPES_EXTRA_H_ +#define KALDI_BASE_KALDI_TYPES_EXTRA_H_ 1 + +#include "base/kaldi-math.h" +#include "base/kaldi-types.h" +#include "base/kaldi-common.h" + +namespace kaldi { + +template +class SignedLogReal { + public: + /// Returns the sign of the real number + inline bool Sign() const { return sign_; } + inline bool Positive() const { return (!sign_); } + inline bool Negative() const { return sign_; } + + /// Returns the log magnitude of the real number + inline Real LogMagnitude() const { return log_f_; } + + /// Returns the real number in double + inline Real Value() const { return + static_cast(Exp(static_cast(log_f_)) * + (sign_ ? -1.0 : 1.0)); } + + /* Basic setting-to-special values functions. */ + + /// Sets the value to zero + void SetZero(); + + /// Sets the number to particular value + void Set(Real); + + /// Sets the number to one + void SetOne(); + + /// Sets the number to random value from normal distribution + void SetRandn(); + + /// Sets the number to uniformly distributed on (0,1) + void SetRandUniform(); + + /* Various special functions. */ + + void Negate() { sign_ = !sign_; }; + + /// Apply log to the value if the number is positive + /// or exit with error. 
+ void Log(); + + /// returns true if the number is zero + bool IsZero(Real cutoff = 1.0e-40) const; + + /// returns true if the number is one + bool IsOne(Real cutoff = 1.0e-06) const; + + /// Returns true if this - other <= tol * this + bool ApproxEqual(const SignedLogReal &other, float tol = 0.01) const; + + /// Tests for exact equality + bool Equal(const SignedLogReal &other) const; + + /// Add another object of same type + template void Add(const SignedLogReal &a); + + /// Add a real number + template void AddReal(OtherReal f); + + /// Add log real number + template void AddLogReal(OtherReal log_f); + + /// Add SignedLogReal multiplied by real number in log + template void AddMultiplyLogReal(const SignedLogReal &a, OtherReal log_f); + + /// Subtract another object of same type + template void Sub(const SignedLogReal &a); + + /// Subtract SignedLogReal multiplied by real number in log + template void SubMultiplyLogReal(const SignedLogReal &a, OtherReal log_f); + + /// Multiply by another object of same type + template void Multiply(const SignedLogReal &a); + + /// Multiply by real number + template void MultiplyReal(OtherReal f); + + /// Multiply by log real number + template void MultiplyLogReal(OtherReal log_f); + + /// DivideBy another object of same type + template void DivideBy(const SignedLogReal &a); + + /// Operators + SignedLogReal operator+(const SignedLogReal &a) const; + SignedLogReal operator*(const SignedLogReal &a) const; + SignedLogReal operator/(const SignedLogReal &a) const; + SignedLogReal operator-(const SignedLogReal &a) const; + + /// Initializer, callable only from child. 
+ /// Default initializer + explicit SignedLogReal() : + sign_(false), log_f_(kLogZeroDouble) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + } + + /// Initialize from a real number + template + explicit SignedLogReal(OtherReal f) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + KALDI_ASSERT_IS_FLOATING_TYPE(OtherReal); + if (f < 0.0) { + sign_ = true; + log_f_ = static_cast(kaldi::Log(static_cast(-f))); + } else { + sign_ = false; + log_f_ = static_cast(kaldi::Log(static_cast(f))); + } + } + + /// Initialize from sign and log real number + template + explicit SignedLogReal(bool sign, OtherReal log_f) : + sign_(sign), log_f_(log_f) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + KALDI_ASSERT_IS_FLOATING_TYPE(OtherReal); + } + + /// Initialize from object + template + explicit SignedLogReal(const SignedLogReal &a) : + sign_(a.Sign()), log_f_(a.LogMagnitude()) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + KALDI_ASSERT_IS_FLOATING_TYPE(OtherReal); + } + + private: + bool sign_; + Real log_f_; +}; + +template +inline std::ostream & operator << (std::ostream & os, const SignedLogReal &a) { + os << (a.Negative() ? 
"-" : "") << "1.0 * Exp(" << a.LogMagnitude() << ")"; + return os; +} + +template +SignedLogReal operator-(const SignedLogReal &a); + +} // namespace kaldi + +#endif // KALDI_BASE_KALDI_TYPES_EXTRA_H_ diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 20f58d52b7d..42404e38384 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -101,7 +101,8 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 TypeTwoUsage(const ParseOptions &po, - bool binary) { + bool binary, + bool average = false) { KALDI_ASSERT(po.NumArgs() == 2); KALDI_ASSERT(ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && "vector-sum: first argument must be an rspecifier"); @@ -133,6 +134,8 @@ int32 TypeTwoUsage(const ParseOptions &po, } } } + + if (num_done > 0 && average) sum.Scale(1.0 / num_done); Vector sum_float(sum); WriteKaldiObject(sum_float, po.GetArg(2), binary); @@ -199,12 +202,13 @@ int main(int argc, char *argv[]) { " e.g.: vector-sum --binary=false 1.vec 2.vec 3.vec sum.vec\n" "See also: copy-vector, dot-weights\n"; - bool binary; + bool binary, average = false; ParseOptions po(usage); po.Register("binary", &binary, "If true, write output as binary (only " "relevant for usage types two or three"); + po.Register("average", &average, "Do average instead of sum"); po.Read(argc, argv); @@ -219,7 +223,7 @@ int main(int argc, char *argv[]) { ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == kNoWspecifier) { // input from a single table, output not to table. 
- exit_status = TypeTwoUsage(po, binary); + exit_status = TypeTwoUsage(po, binary, average); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == diff --git a/src/chain/Makefile b/src/chain/Makefile index e24913c06f2..c02844767f8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -12,7 +12,7 @@ OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o ifeq ($(CUDA), true) - OBJFILES += chain-kernels.o + OBJFILES += chain-kernels.o endif LIBNAME = kaldi-chain @@ -53,7 +53,7 @@ endif ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ ../fstext/kaldi-fstext.a \ - ../matrix/kaldi-matrix.a ../cudamatrix/kaldi-cudamatrix.a \ + ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ ../util/kaldi-util.a ../base/kaldi-base.a diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h index 7ea58038918..52e388a3f2e 100644 --- a/src/chain/chain-datastruct.h +++ b/src/chain/chain-datastruct.h @@ -45,7 +45,8 @@ extern "C" { }; - + // Search for this in chain-kernels.cu for an explanation. 
+ enum { kThresholdingPowerOfTwo = 14 }; } diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index a654ad7d05f..ceb61a550f0 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -139,77 +139,6 @@ void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { Vector avg_prob_float(avg_prob); initial_probs_ = avg_prob_float; - special_hmm_state_ = ComputeSpecialState(fst, avg_prob_float); -} - -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state) { - int32 num_states = fst.NumStates(), - num_states_can_reach = 0; - KALDI_ASSERT(dest_state >= 0 && dest_state < num_states); - std::vector can_reach(num_states, false); - std::vector > reverse_transitions(num_states); - for (int32 s = 0; s < num_states; s++) - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - reverse_transitions[aiter.Value().nextstate].push_back(s); - std::vector queue; - can_reach[dest_state] = true; - queue.push_back(dest_state); - num_states_can_reach++; - while (!queue.empty()) { - int32 state = queue.back(); - queue.pop_back(); - std::vector::const_iterator iter = reverse_transitions[state].begin(), - end = reverse_transitions[state].end(); - for (; iter != end; ++iter) { - int32 prev_state = *iter; - if (!can_reach[prev_state]) { - can_reach[prev_state] = true; - queue.push_back(prev_state); - num_states_can_reach++; - } - } - } - KALDI_ASSERT(num_states_can_reach >= 1 && - num_states_can_reach <= num_states); - return num_states_can_reach; -} - - -int32 DenominatorGraph::ComputeSpecialState( - const fst::StdVectorFst &fst, - const Vector &initial_probs) { - int32 num_states = initial_probs.Dim(); - std::vector > pairs(num_states); - for (int32 i = 0; i < num_states; i++) - pairs.push_back(std::pair(-initial_probs(i), i)); - // the first element of each pair is the negative of the initial-prob, - // so when we sort, the highest initial-prob will be first. 
- std::sort(pairs.begin(), pairs.end()); - // this threshold of 0.75 is pretty arbitrary. We reject any - // state if it can't be reached by 75% of all other states. - // In practice we think that states will either be reachable by - // almost-all states, or almost-none (e.g. states that are active - // only at utterance-beginning), so this threshold shouldn't - // be too critical. - int32 min_states_can_reach = 0.75 * num_states; - for (int32 i = 0; i < num_states; i++) { - int32 state = pairs[i].second; - int32 n = NumStatesThatCanReach(fst, state); - if (n < min_states_can_reach) { - KALDI_WARN << "Rejecting state " << state << " as a 'special' HMM state " - << "(for renormalization in fwd-bkwd), because it's only " - << "reachable by " << n << " out of " << num_states - << " states."; - } else { - return state; - } - } - KALDI_ERR << "Found no states that are reachable by at least " - << min_states_can_reach << " out of " << num_states - << " states. This is unexpected. Change the threshold"; - return -1; } void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, @@ -261,6 +190,34 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { fst::Decode(fst, encoder); } +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. 
The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { fst::PushSpecial(fst, fst::kDelta * 0.01); @@ -414,6 +371,8 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, DenGraphMinimizeWrapper(&transition_id_fst); + SortOnTransitionCount(&transition_id_fst); + *den_fst = transition_id_fst; CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); PrintDenGraphStats(*den_fst); diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h index 8e5ee39e4bd..b2510651f39 100644 --- a/src/chain/chain-den-graph.h +++ b/src/chain/chain-den-graph.h @@ -88,13 +88,6 @@ class DenominatorGraph { // Note: we renormalize each HMM-state to sum to one before doing this. const CuVector &InitialProbs() const; - // returns the index of the HMM-state that has the highest value in - // InitialProbs (and which we believe will always be reachable from most other - // states... 
later on we may check this more carefully [TODO]). - // It's used in getting the 'arbitrary_scale' value to keep the alphas - // in a good dynamic range. - int32 SpecialHmmState() const { return special_hmm_state_; } - // This function outputs a modifified version of the FST that was used to // build this object, that has an initial-state with epsilon transitions to // each state, with weight determined by initial_probs_; and has each original @@ -116,23 +109,15 @@ class DenominatorGraph { // functions called from the constructor void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); - // work out the initial-probs and the 'special state' - // Note, there are no final-probs; we treat all states as final - // with probability one [we have a justification for this.. - // assuming it's roughly a well-normalized HMM, this makes sense; - // note that we train on chunks, so the beginning and end of a chunk - // appear at arbitrary points in the sequence. - // At both beginning and end of the chunk, we limit ourselves to - // only those pdf-ids that were allowed in the numerator sequence. + // work out the initial-probs. Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. void SetInitialProbs(const fst::StdVectorFst &fst); - // return a suitable 'special' HMM-state used for normalizing probabilities in - // the forward-backward. It has to have a reasonably high probability and be - // reachable from most of the graph. returns a suitable state-index - // that we can set special_hmm_state_ to. 
- int32 ComputeSpecialState(const fst::StdVectorFst &fst, - const Vector &initial_probs); - // forward_transitions_ is an array, indexed by hmm-state index, // of start and end indexes into the transition_ array, which // give us the set of transitions out of this state. @@ -152,23 +137,9 @@ class DenominatorGraph { // distribution of the HMM. This isn't too critical. CuVector initial_probs_; - // The index of a somewhat arbitrarily chosen HMM-state that we - // use for adjusting the alpha probabilities. It needs to be - // one that is reachable from all states (i.e. not a special - // state that's only reachable at sentence-start). We choose - // whichever one has the greatest initial-prob. It's set - // in SetInitialProbs(). - int32 special_hmm_state_; - int32 num_pdfs_; }; -// returns the number of states from which there is a path to -// 'dest_state'. Utility function used in selecting 'special' state -// for normalization of probabilities. -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state); - // Function that does acceptor minimization without weight pushing... // this is useful when constructing the denominator graph. 
diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index eaee850a999..258c33cd465 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -39,12 +39,23 @@ DenominatorComputation::DenominatorComputation( std::min(exp_nnet_output_transposed_.NumCols(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), - alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_, + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), - beta_(2, den_graph_.NumStates() * num_sequences_, kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), tot_prob_(num_sequences_, kUndefined), tot_log_prob_(num_sequences_, kUndefined), - log_correction_term_(num_sequences_, kUndefined) { + log_correction_term_(num_sequences_, kUndefined), + ok_(true) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + // make sure the alpha sums and beta sums are zeroed. 
+ alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); exp_nnet_output_transposed_.ApplyExp(); } @@ -70,13 +81,12 @@ void DenominatorComputation::AlphaFirstFrame() { void DenominatorComputation::AlphaGeneralFrame(int32 t) { KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); BaseFloat *this_alpha = alpha_.RowData(t); - const BaseFloat *prev_alpha = alpha_.RowData(t - 1); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = den_graph_.SpecialHmmState(); + num_sequences = num_sequences_; // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. 
CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, @@ -90,8 +100,8 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, - num_sequences, special_hmm_state, prob_data, - probs.Stride(), prev_alpha, this_alpha); + num_sequences, prob_data, probs.Stride(), + prev_alpha_dash, this_alpha); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -110,18 +120,19 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { int32 pdf_id = trans_iter->pdf_id, prev_hmm_state = trans_iter->hmm_state; BaseFloat prob = prob_data[pdf_id * prob_stride + s], - this_prev_alpha = prev_alpha[prev_hmm_state * num_sequences + s]; + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. 
BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -129,37 +140,89 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { } } +void DenominatorComputation::AlphaDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + // the alpha-dash is the sum of alpha over all states. + CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. +} + +// compute beta from beta-dash. +void DenominatorComputation::Beta(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat + // will contain the actual beta (i.e. 
the counterpart of alpha), + // not the beta-dash. + beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); +} + BaseFloat DenominatorComputation::Forward() { AlphaFirstFrame(); - for (int32 t = 1; t <= frames_per_sequence_; t++) + AlphaDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { AlphaGeneralFrame(t); + AlphaDash(t); + } return ComputeTotLogLike(); } BaseFloat DenominatorComputation::ComputeTotLogLike() { tot_prob_.Resize(num_sequences_); - // View the last alpha as a matrix of size num-hmm-states by num-sequences. - CuSubMatrix last_alpha(alpha_.RowData(frames_per_sequence_), - den_graph_.NumStates(), - num_sequences_, - num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); - tot_prob_.AddRowSumMat(1.0, last_alpha, 0.0); + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); // we should probably add an ApplyLog() function that takes a vector argument. tot_log_prob_ = tot_prob_; tot_log_prob_.ApplyLog(); BaseFloat tot_log_prob = tot_log_prob_.Sum(); - // We now have to add something for the arbitrary scaling factor. the - // inverses of all the alphas for hmm-states numbered zero, for t = 0 - // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in the - // transition-probs, so we need to multiply them all together (not inversed) - // and add them as a correction term to the total log-likes. Note: the + // We now have to add something for the arbitrary scaling factor. [note: the // purpose of the arbitrary scaling factors was to keep things in a good - // floating-point range. + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... 
frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. + // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); CuSubMatrix inv_arbitrary_scales( alpha_, 0, frames_per_sequence_, - num_sequences_ * den_graph_.SpecialHmmState(), num_sequences_); + num_sequences_ * num_hmm_states, num_sequences_); CuMatrix log_inv_arbitrary_scales( inv_arbitrary_scales); log_inv_arbitrary_scales.ApplyLog(); @@ -170,12 +233,16 @@ BaseFloat DenominatorComputation::ComputeTotLogLike() { -void DenominatorComputation::Backward( +bool DenominatorComputation::Backward( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { - BetaLastFrame(); + BetaDashLastFrame(); + Beta(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { - BetaGeneralFrame(t); + BetaDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaGeneralFrameDebug(t); + Beta(t); if (t % kMaxDerivTimeSteps == 0) { // commit the derivative stored in exp_nnet_output_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. @@ -190,35 +257,35 @@ void DenominatorComputation::Backward( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, - kTrans); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); if (t != 0) transposed_deriv_part.SetZero(); } } + return ok_; } -void DenominatorComputation::BetaLastFrame() { - // sets up the beta on the last frame (frame == frames_per_sequence_). Note that - // the betas we use here contain a 1/(tot-prob) factor in order to simplify - // the backprop. 
+void DenominatorComputation::BetaDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. int32 t = frames_per_sequence_; - BaseFloat *last_frame_beta = beta_.RowData(t % 2); + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); // create a 'fake matrix' - view this row as a matrix. - CuSubMatrix beta_mat(last_frame_beta, - den_graph_.NumStates(), - num_sequences_, - num_sequences_); + CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); CuVector inv_tot_prob(tot_prob_); inv_tot_prob.InvertElements(); // the beta values at the end of the file only vary with the sequence-index, // not with the HMM-index. We treat all states as having a final-prob of one. - beta_mat.CopyRowsFromVec(inv_tot_prob); + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); } -void DenominatorComputation::BetaGeneralFrame(int32 t) { +void DenominatorComputation::BetaDashGeneralFrame(int32 t) { KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); // t_wrapped gives us the time-index we use when indexing @@ -226,9 +293,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { // matrix, storing only chunks of frames at a time, and we add it to the // non-transposed output whenever we finish a chunk. int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); - const BaseFloat *this_alpha = alpha_.RowData(t), + const BaseFloat *this_alpha_dash = alpha_.RowData(t), *next_beta = beta_.RowData((t + 1) % 2); - BaseFloat *this_beta = beta_.RowData(t % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2); const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); // 'probs' is the matrix of pseudo-likelihoods for frame t. 
@@ -238,8 +305,7 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = den_graph_.SpecialHmmState(); + num_sequences = num_sequences_; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -247,10 +313,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, - num_sequences, special_hmm_state, - probs.Data(), probs.Stride(), this_alpha, next_beta, - this_beta, log_prob_deriv.Data(), - log_prob_deriv.Stride()); + num_sequences, probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -262,12 +327,12 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { - BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha_dash[num_hmm_states * num_sequences + s]; double tot_variable_factor = 0.0; - BaseFloat - occupation_factor = this_alpha_prob / inv_arbitrary_scale; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, *trans_end = transitions + forward_transitions[h].second; @@ -282,13 +347,49 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { BaseFloat occupation_prob = variable_factor * 
occupation_factor; log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; } - this_beta[h * num_sequences + s] = + this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; } } } } +void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix this_log_prob_deriv( + nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. 
+ if (!ApproxEqual(this_log_prob_deriv_sum, + num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } +} + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index f3b0afa6721..b0f616673d6 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -41,6 +41,153 @@ namespace kaldi { namespace chain { +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequence. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(t, i) for 0 <= t <= T: + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probability from state i. 
# in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general. + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / tot-alpha(t). 
In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} \log tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as if we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. 
+ + - For frame 0, set alpha(0, i) = init(i). + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (log(total-prob) + \sum_{t=0}^{T-1} \log tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + // This does forward-backward in parallel on a number of sequences, using a // single HMM. class DenominatorComputation { @@ -70,7 +217,8 @@ class DenominatorComputation { // this adds deriv_weight times (the derivative of the log-prob w.r.t. 
the // nnet output), to 'nnet_output_deriv'. - void Backward(BaseFloat deriv_weight, + // returns true if everything seemed OK, false if a failure was detected. + bool Backward(BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv); private: @@ -84,6 +232,9 @@ class DenominatorComputation { void AlphaFirstFrame(); // the alpha computation for some 0 < t <= num_time_steps_. void AlphaGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaDash(int32 t); // done after all the alphas, this function computes and returns the total // log-likelihood summed over all the sequences, and sets tot_prob_ (if we're @@ -92,9 +243,15 @@ class DenominatorComputation { // from the Forward() computation). BaseFloat ComputeTotLogLike(); - void BetaLastFrame(); + void BetaDashLastFrame(); // beta computation for 0 <= beta < num_time_steps_. - void BetaGeneralFrame(int32 t); + void BetaDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void Beta(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaGeneralFrameDebug(int32 t); const ChainTrainingOptions &opts_; const DenominatorGraph &den_graph_; @@ -116,13 +273,18 @@ class DenominatorComputation { // the derivs w.r.t. the nnet outputs (transposed) CuMatrix nnet_output_deriv_transposed_; - // the alpha probabilities; dimension is (frames_per_sequence + 1) by (num-hmm-states - // * num-sequences). Note, they are not logs. + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. 
CuMatrix alpha_; - // the beta probabilities (rolling buffer); dimension is 2 * (num-hmm-states * - // num-sequences). Note: for efficiency and to simplify the equations, these - // are actually the beta / tot_prob_. + // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. CuMatrix beta_; // the total probability for each sequence, excluding the product of @@ -136,11 +298,13 @@ class DenominatorComputation { CuVector tot_log_prob_; // the log of the total correction term for each sequence, which is the - // product of the alpha_[special hmm state] over all the frames. The - // 'correction terms' are terms that we divide the alphas and betas by in - // order to keep them in a good dynamic range. The product of them - // must be included in the total likelihood. + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. 
CuVector log_correction_term_; + + bool ok_; }; diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index af7a1a6b176..8ec1dcf322c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -29,7 +29,6 @@ extern "C" { const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, @@ -42,7 +41,6 @@ extern "C" { const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 8fcf8037d36..ea10b6680f0 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -40,9 +40,9 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { // threshold itself with probability (value / threshold). This preserves // expectations. Note: we assume that value >= 0. - // you can choose any value for the threshold, but powers of 2 are nice - // because they will exactly preserve the precision of the value. - const Real threshold = 1.0 / (1 << 14); + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); if (value >= threshold) { atomic_add(address, value); } else { @@ -67,7 +67,6 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { if ((x >> 12) > (x & 4095)) atomic_add(address, threshold); } - } // one iteration of the forward computation in the 'tombstone' CTC HMM computation. 
@@ -82,7 +81,6 @@ __global__ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, @@ -137,15 +135,18 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + int32_cuda num_hmm_states = gridDim.y; + // Let arbitrary_scale be the inverse of the sum of all alpha values on the + // previous frame-- this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. 
BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -154,7 +155,6 @@ __global__ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, @@ -179,10 +179,14 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, if (s >= num_sequences) return; + // below, you can read 'gridDim.y' as 'num_hmm_states'. See where + // arbitrary_scale is defined in the forward computation above, for more + // explanation. BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha[gridDim.y * num_sequences + s]; double tot_variable_factor = 0.0; + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, @@ -223,7 +227,8 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), occupation_prob0); } - this_beta[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; + BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; + this_beta[h * num_sequences + s] = beta; } @@ -231,28 +236,26 @@ void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, BaseFloat *this_alpha) { _cuda_chain_hmm_forward<<>>(backward_transitions, 
transitions, - num_sequences, special_hmm_state, - probs, prob_stride, prev_alpha, this_alpha); + num_sequences, probs, prob_stride, + prev_alpha, this_alpha); } void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { _cuda_chain_hmm_backward<<>>(forward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, this_alpha, next_beta, + num_sequences, probs, prob_stride, + this_alpha, next_beta, this_beta, log_prob_deriv, log_prob_deriv_stride); } diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 1dc9d9d489d..15cb31e0571 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -76,8 +76,8 @@ class NumeratorComputation { BaseFloat Forward(); // Does the backward computation and (efficiently) adds the derivative of the - // nnet output w.r.t. the (log-prob times supervision_.weight) to - // 'nnet_output_deriv'. + // nnet output w.r.t. the (log-prob times supervision_.weight times + // deriv_weight) to 'nnet_output_deriv'. 
void Backward(CuMatrixBase *nnet_output_deriv); private: diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index e6a333317e8..ea673df3291 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -251,15 +251,17 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, nnet_output.SetRandn(); ChainTrainingOptions opts; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; CuMatrix nnet_output_deriv(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat objf, weight; + BaseFloat objf, l2_term, weight; ComputeChainObjfAndDeriv(opts, den_graph, supervision, - nnet_output, &objf, &weight, + nnet_output, &objf, &l2_term, &weight, &nnet_output_deriv); { @@ -296,11 +298,12 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, CuMatrix nnet_output_perturbed(nnet_delta_output); nnet_output_perturbed.AddMat(1.0, nnet_output); - BaseFloat objf_modified, weight_modified; + BaseFloat objf_modified, l2_term_modified, weight_modified; ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output_perturbed, - &objf_modified, &weight_modified, + &objf_modified, &l2_term_modified, + &weight_modified, NULL); observed_objf_changes(p) = objf_modified - objf; @@ -419,21 +422,6 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) { 10.0); } - { // another check: that scaling the initial probs has the expected effect. 
- BaseFloat scale = 0.1 + 0.7 * RandUniform(); - DenominatorGraph den_graph_scaled(den_graph); - den_graph_scaled.ScaleInitialProbs(scale); - DenominatorComputation denominator_computation_scaled_initial( - opts, den_graph_scaled, - num_sequences, nnet_output); - BaseFloat forward_prob_scaled_initial = - denominator_computation_scaled_initial.Forward(); - BaseFloat observed_difference = - forward_prob_scaled_initial - forward_prob, - predicted_difference = num_sequences * log(scale); - AssertEqual(observed_difference, predicted_difference); - } - int32 num_tries = 5; BaseFloat epsilon = 1.0e-04; Vector predicted_objf_changes(num_tries), diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 03fdb3cbe64..a1972736c68 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -679,60 +679,6 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, return true; } -void SplitIntoRanges(int32 num_frames, - int32 frames_per_range, - std::vector *range_starts) { - if (frames_per_range > num_frames) { - range_starts->clear(); - return; // there is no room for even one range. - } - int32 num_ranges = num_frames / frames_per_range, - extra_frames = num_frames % frames_per_range; - // this is a kind of heuristic. If the number of frames we'd - // be skipping is less than 1/4 of the frames_per_range, then - // skip frames; otherwise, duplicate frames. - // it's important that this is <=, not <, so that if - // extra_frames == 0 and frames_per_range is < 4, we - // don't insert an extra range. - if (extra_frames <= frames_per_range / 4) { - // skip frames. we do this at start or end, or between ranges. 
- std::vector num_skips(num_ranges + 1, 0); - for (int32 i = 0; i < extra_frames; i++) - num_skips[RandInt(0, num_ranges)]++; - range_starts->resize(num_ranges); - int32 cur_start = num_skips[0]; - for (int32 i = 0; i < num_ranges; i++) { - (*range_starts)[i] = cur_start; - cur_start += frames_per_range; - cur_start += num_skips[i + 1]; - } - KALDI_ASSERT(cur_start == num_frames); - } else { - // duplicate frames. - num_ranges++; - int32 num_duplicated_frames = frames_per_range - extra_frames; - // the way we handle the 'extra_frames' frames of output is that we - // backtrack zero or more frames between outputting each pair of ranges, and - // the total of these backtracks equals 'extra_frames'. - std::vector num_backtracks(num_ranges, 0); - for (int32 i = 0; i < num_duplicated_frames; i++) { - // num_ranges - 2 below is not a bug. we only want to backtrack - // between ranges, not past the end of the last range (i.e. at - // position num_ranges - 1). we make the vector one longer to - // simplify the loop below. 
- num_backtracks[RandInt(0, num_ranges - 2)]++; - } - range_starts->resize(num_ranges); - int32 cur_start = 0; - for (int32 i = 0; i < num_ranges; i++) { - (*range_starts)[i] = cur_start; - cur_start += frames_per_range; - cur_start -= num_backtracks[i]; - } - KALDI_ASSERT(cur_start == num_frames); - } -} - bool Supervision::operator == (const Supervision &other) const { return weight == other.weight && num_sequences == other.num_sequences && frames_per_sequence == other.frames_per_sequence && @@ -755,50 +701,6 @@ void Supervision::Check(const TransitionModel &trans_mdl) const { KALDI_ERR << "Num-frames does not match fst."; } -void GetWeightsForRanges(int32 range_length, - const std::vector &range_starts, - std::vector > *weights) { - KALDI_ASSERT(range_length > 0); - int32 num_ranges = range_starts.size(); - weights->resize(num_ranges); - for (int32 i = 0; i < num_ranges; i++) { - (*weights)[i].Resize(range_length); - (*weights)[i].Set(1.0); - } - for (int32 i = 0; i + 1 < num_ranges; i++) { - int32 j = i + 1; - int32 i_start = range_starts[i], i_end = i_start + range_length, - j_start = range_starts[j]; - KALDI_ASSERT(j_start > i_start); - if (i_end > j_start) { - Vector &i_weights = (*weights)[i], &j_weights = (*weights)[j]; - - int32 overlap_length = i_end - j_start; - // divide the overlapping piece of the 2 ranges into 3 regions of - // approximately equal size, called the left, middle and right region. - int32 left_length = overlap_length / 3, - middle_length = (overlap_length - left_length) / 2, - right_length = overlap_length - left_length - middle_length; - KALDI_ASSERT(left_length >= 0 && middle_length >= 0 && right_length >= 0 && - left_length + middle_length + right_length == overlap_length); - // set the weight of the left region to be zero for the right (j) range. - for (int32 k = 0; k < left_length; k++) - j_weights(k) = 0.0; - // set the weight of the right region to be zero for the left (i) range. 
- for (int32 k = 0; k < right_length; k++) - i_weights(range_length - 1 - k) = 0.0; - // for the middle range, linearly interpolate between the 0's and 1's. - // note: we multiply with existing weights instead of set in order to get - // more accurate behavior in the unexpected case where things triply - // overlap. - for (int32 k = 0; k < middle_length; k++) { - BaseFloat weight = (0.5 + k) / middle_length; - j_weights(left_length + k) = weight; - i_weights(range_length - 1 - right_length - k) = weight; - } - } - } -} } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index b17f62d00ad..0ca12e628e1 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -364,42 +364,27 @@ void AppendSupervision(const std::vector &input, std::vector *output_supervision); -/// This function helps you to pseudo-randomly split a sequence of length 'num_frames', -/// interpreted as frames 0 ... num_frames - 1, into pieces of length exactly -/// 'frames_per_range', to be used as examples for training. Because frames_per_range -/// may not exactly divide 'num_frames', this function will leave either small gaps or -/// small overlaps in pseudo-random places. -/// The output 'range_starts' will be set to a list of the starts of ranges, the -/// output ranges are of the form -/// [ (*range_starts)[i] ... (*range_starts)[i] + frames_per_range - 1 ]. -void SplitIntoRanges(int32 num_frames, - int32 frames_per_range, - std::vector *range_starts); - - -/// This utility function is not used directly in the 'chain' code. It is used -/// to get weights for the derivatives, so that we don't doubly train on some -/// frames after splitting them up into overlapping ranges of frames. The input -/// 'range_starts' will be obtained from 'SplitIntoRanges', but the -/// 'range_length', which is a length in frames, may be longer than the one -/// supplied to SplitIntoRanges, due the 'overlap'. 
(see the calling code... -/// if we want overlapping ranges, we get it by 'faking' the input to -/// SplitIntoRanges). -/// -/// The output vector 'weights' will be given the same dimension as -/// 'range_starts'. By default the output weights in '*weights' will be vectors -/// of all ones, of length equal to 'range_length', and '(*weights)[i]' represents -/// the weights given to frames numbered -/// t = range_starts[i] ... range_starts[i] + range_length - 1. -/// If these ranges for two successive 'i' values overlap, then we -/// reduce the weights to ensure that no 't' value gets a total weight -/// greater than 1. We do this by dividing the overlapped region -/// into three approximately equal parts, and giving the left part -/// to the left range; the right part to the right range; and -/// in between, interpolating linearly. -void GetWeightsForRanges(int32 range_length, - const std::vector &range_starts, - std::vector > *weights); + +/// This is a newer version of GetWeightsForRanges with a simpler behavior +/// than GetWeightsForRanges and a different purpose. Instead of aiming to +/// create weights that sum to one over the whole file, the purpose is to +/// zero out the derivative weights for a certain number of frames to each +/// side of every 'cut point' in the numerator lattice [by numerator lattice, +/// what I mean is the FST that we automatically generate from the numerator +/// alignment or lattice]. So we don't zero out the weights for the very +/// beginning or very end of each original utterance, just those where +/// we split the utterance into pieces. We believe there is an incentive +/// for the network to produce deletions near the edges, and this aims to fix +/// this problem. 
+/// range_length is the length of each range of times (so range_starts[0] +/// represents the start of a range of t values of length 'range_length' +/// and so range_starts[1] etc.), and num_frames_zeroed is the number of frames +/// on each side of the cut point on which we are supposed to zero out the +/// derivative. +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights); typedef TableWriter > SupervisionWriter; diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 42cdfed2713..1bf0201fbfa 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -29,9 +29,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *tot_objf, - BaseFloat *tot_weight, - CuMatrixBase *nnet_output_deriv) { + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { BaseFloat num_logprob_weighted; if (nnet_output_deriv) nnet_output_deriv->SetZero(); @@ -40,29 +42,44 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // note: supervision.weight is included as a factor in the derivative from // the numerator object, and the logprob too. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) + if (nnet_output_deriv) { numerator.Backward(nnet_output_deriv); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(*nnet_output_deriv); + } else if (xent_output_deriv) { + // this branch will be taken if xent_output_deriv but not + // nnet_output_deriv is set- which could happen if you want to compute the + // cross-entropy objective but not the derivatives. 
+ xent_output_deriv->SetZero(); + numerator.Backward(xent_output_deriv); + } } DenominatorComputation denominator(opts, den_graph, supervision.num_sequences, nnet_output); BaseFloat den_logprob = denominator.Forward(); + bool ok = true; if (nnet_output_deriv) - denominator.Backward(-supervision.weight, - nnet_output_deriv); + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); - *tot_objf = num_logprob_weighted - supervision.weight * den_logprob; - *tot_weight = supervision.weight * supervision.num_sequences * + *objf = num_logprob_weighted - supervision.weight * den_logprob; + *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; - if (!(*tot_objf == *tot_objf)) { - // inf or NaN detected + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. if (nnet_output_deriv) nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); BaseFloat default_objf = -10; - KALDI_WARN << "Objective function is " << (*tot_objf) - << ", setting to " << default_objf << " per frame."; - *tot_objf = default_objf * *tot_weight; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; } // This code helps us see how big the derivatives are, on average, @@ -81,6 +98,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, row_products_per_frame(i / num_sequences) += row_products_cpu(i); KALDI_LOG << "Derivs per frame are " << row_products_per_frame; } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * 
scale, nnet_output); + } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 8eb7e8343f4..e6143d10846 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -40,11 +40,44 @@ namespace chain { struct ChainTrainingOptions { - // Currently empty. - - ChainTrainingOptions() { } - + // l2 regularization constant on the 'chain' output; the actual term added to + // the objf will be -0.5 times this constant times the squared l2 norm. + // (squared so it's additive across the dimensions). e.g. try 0.0005. + BaseFloat l2_regularize; + + // Coefficient for 'leaky hmm'. This means we have an epsilon-transition from + // each state to a special state with probability one, and then another + // epsilon-transition from that special state to each state, with probability + // leaky_hmm_coefficient times [initial-prob of destination state]. Imagine + // we make two copies of each state prior to doing this, version A and version + // B, with transition from A to B, so we don't have to consider epsilon loops- + // or just imagine the coefficient is small enough that we can ignore the + // epsilon loops. + BaseFloat leaky_hmm_coefficient; + + + // Cross-entropy regularization constant. (e.g. try 0.1). If nonzero, + // the network is expected to have an output named 'output-xent', which + // should have a softmax as its final nonlinearity. + BaseFloat xent_regularize; + + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), + xent_regularize(0.0) { } + void Register(OptionsItf *opts) { + opts->Register("l2-regularize", &l2_regularize, "l2 regularization " + "constant for 'chain' training, applied to the output " + "of the neural net."); + opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " + "that allows transitions from each HMM state to each other " + "HMM state, to ensure gradual forgetting of context (can " + "improve generalization). 
For numerical reasons, may not be " + "exactly zero."); + opts->Register("xent-regularize", &xent_regularize, "Cross-entropy " + "regularization constant for 'chain' training. If " + "nonzero, the network is expected to have an output " + "named 'output-xent', which should have a softmax as " + "its final nonlinearity."); } }; @@ -59,10 +92,13 @@ struct ChainTrainingOptions { paths and constraints on the alignment as an FST @param [in] nnet_output The output of the neural net; dimension must equal ((supervision.num_sequences * supervision.frames_per_sequence) by - den_graph.NumPdfs()). + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. @param [out] objf The [num - den] objective function computed for this example; you'll want to divide it by 'tot_weight' before displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o @param [out] weight The weight to normalize the objective function by; equals supervision.weight * supervision.num_sequences * supervision.frames_per_sequence. @@ -70,14 +106,22 @@ struct ChainTrainingOptions { the neural-net output. Only written to if non-NULL. You don't have to zero this before passing to this function, we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. 
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *tot_objf, - BaseFloat *tot_weight, - CuMatrixBase *nnet_output_deriv); + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); + } // namespace chain diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index 3bdf710c489..3f092879b6e 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -40,9 +40,11 @@ class NnetChainLdaStatsAccumulator { void AccStats(const NnetChainExample &eg) { ComputationRequest request; - bool need_backprop = false, store_stats = false; + bool need_backprop = false, store_stats = false, + need_xent = false, need_xent_deriv = false; - GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, &request); + GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, + need_xent, need_xent_deriv, &request); const NnetComputation &computation = *(compiler_.Compile(request)); diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 4e32d280638..6820ee125e0 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -25,6 +25,7 @@ #include "hmm/posterior.h" #include "nnet3/nnet-example.h" #include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" namespace kaldi { namespace nnet3 { @@ -48,6 +49,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 frames_per_eg, int32 frames_overlap_per_eg, int32 frame_subsampling_factor, + int32 cut_zero_frames, int64 *num_frames_written, int64 *num_egs_written, NnetChainExampleWriter *example_writer) { @@ -78,7 +80,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, // Instead we select ranges of frames that fully 
fit within the file; these // might slightly overlap with each other or have gaps. std::vector range_starts_subsampled; - chain::SplitIntoRanges(num_feature_frames_subsampled - + SplitIntoRanges(num_feature_frames_subsampled - frames_overlap_subsampled, frames_shift_subsampled, &range_starts_subsampled); @@ -88,10 +90,16 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, // to the edge are not as accurate as they could be, because when we split we // don't know the correct alphas and betas). std::vector > deriv_weights; - chain::GetWeightsForRanges(frames_per_eg_subsampled, - range_starts_subsampled, - &deriv_weights); - + if (cut_zero_frames >= 0) + GetWeightsForRangesNew(frames_per_eg_subsampled, + cut_zero_frames / frame_subsampling_factor, + range_starts_subsampled, + &deriv_weights); + else + GetWeightsForRanges(frames_per_eg_subsampled, + range_starts_subsampled, + &deriv_weights); + if (range_starts_subsampled.empty()) { KALDI_WARN << "No output for utterance " << utt_id << " (num-frames=" << num_feature_frames @@ -177,35 +185,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return true; } -void RoundUpNumFrames(int32 frame_subsampling_factor, - int32 *num_frames, - int32 *num_frames_overlap) { - if (*num_frames % frame_subsampling_factor != 0) { - int32 new_num_frames = frame_subsampling_factor * - (*num_frames / frame_subsampling_factor + 1); - KALDI_LOG << "Rounding up --num-frames=" << (*num_frames) - << " to a multiple of --frame-subsampling-factor=" - << frame_subsampling_factor - << ", now --num-frames=" << new_num_frames; - *num_frames = new_num_frames; - } - if (*num_frames_overlap % frame_subsampling_factor != 0) { - int32 new_num_frames_overlap = frame_subsampling_factor * - (*num_frames_overlap / frame_subsampling_factor + 1); - KALDI_LOG << "Rounding up --num-frames-overlap=" << (*num_frames_overlap) - << " to a multiple of --frame-subsampling-factor=" - << frame_subsampling_factor - << ", now 
--num-frames-overlap=" << new_num_frames_overlap; - *num_frames_overlap = new_num_frames_overlap; - } - if (*num_frames_overlap < 0 || *num_frames_overlap >= *num_frames) { - KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap) << " < " - << "--num-frames=" << (*num_frames); - } - -} - - } // namespace nnet2 } // namespace kaldi @@ -237,6 +216,7 @@ int main(int argc, char *argv[]) { bool compress = true; int32 left_context = 0, right_context = 0, num_frames = 1, num_frames_overlap = 0, length_tolerance = 100, + cut_zero_frames = -1, frame_subsampling_factor = 1; std::string ivector_rspecifier; @@ -244,6 +224,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format (recommended)"); + po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames " + "(measured before subsampling) to zero the derivative on each " + "side of a cut point (if set, activates new-style derivative " + "weights)"); po.Register("left-context", &left_context, "Number of frames of left " "context the neural net requires."); po.Register("right-context", &right_context, "Number of frames of right " @@ -338,9 +322,10 @@ int main(int argc, char *argv[]) { continue; } if (ProcessFile(normalization_fst, feats, ivector_feats, supervision, - key, compress, left_context, right_context, num_frames, + key, compress, + left_context, right_context, num_frames, num_frames_overlap, frame_subsampling_factor, - &num_frames_written, &num_egs_written, + cut_zero_frames, &num_frames_written, &num_egs_written, &example_writer)) num_done++; else diff --git a/src/configure b/src/configure index c90e9ba4ee0..acd63da0d84 100755 --- a/src/configure +++ b/src/configure @@ -177,7 +177,10 @@ do esac done - +# the idea here is that if you change the configuration options from using +# CUDA to not using it, or vice versa, we want to recompile all parts of the +# code that may use a GPU. 
Touching this file is a way to force this. +touch cudamatrix/cu-common.h 2>/dev/null function failure { echo "***configure failed: $* ***" >&2 diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 8718c49eea5..2b23bf0b621 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -51,19 +51,20 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, dim3 *dimBlock) { KALDI_ASSERT(num_rows > 0 && num_cols > 0); int32 col_blocksize = 64, row_blocksize = 4; - while (num_cols + (num_cols / 2) <= col_blocksize && - num_rows > 65536 * row_blocksize) { + while (col_blocksize > 1 && + (num_cols + (num_cols / 2) <= col_blocksize || + num_rows > 65536 * row_blocksize)) { col_blocksize /= 2; row_blocksize *= 2; } - KALDI_ASSERT(col_blocksize > 0 && "Matrix too large to process"); - dimBlock->x = col_blocksize; dimBlock->y = row_blocksize; dimBlock->z = 1; dimGrid->x = n_blocks(num_cols, col_blocksize); dimGrid->y = n_blocks(num_rows, row_blocksize); + KALDI_ASSERT(dimGrid->y <= 65536 && + "Matrix has too many rows to process"); dimGrid->z = 1; } #endif diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 804bea1a217..2d8aae1808c 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -59,6 +59,7 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim d); void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_apply_signum(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, 
const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -198,6 +199,7 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim d); void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); +void cudaD_apply_signum(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 00af3eb234a..422bc5af2f3 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -931,15 +931,15 @@ static void _add_diag_mat_mat( int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells // us which block of elements we sum up. - if (v_idx >= v_dim) return; - - Real sum = 0.0; - for (int j = sub_idx; j < M_cols; j += threads_per_element) { - int M_index = v_idx * M_row_stride + j * M_col_stride, - N_index = j * N_row_stride + v_idx * N_col_stride; - sum += M[M_index] * N[N_index]; + if (v_idx < v_dim) { + Real sum = 0.0; + for (int j = sub_idx; j < M_cols; j += threads_per_element) { + int M_index = v_idx * M_row_stride + j * M_col_stride, + N_index = j * N_row_stride + v_idx * N_col_stride; + sum += M[M_index] * N[N_index]; + } + temp_data[threadIdx.x] = sum; } - temp_data[threadIdx.x] = sum; // start_idx = threadIdx.x - sub_idx; // start of the position in temp_data // that we want to sum up. 
@@ -959,7 +959,7 @@ static void _add_diag_mat_mat( __syncthreads(); num_total_threads = half_point; } - if (sub_idx == 0) { + if (sub_idx == 0 && v_idx < v_dim) { v[v_idx] = beta * v[v_idx] + alpha * temp_data[threadIdx.x]; } } @@ -1152,7 +1152,6 @@ __global__ static void _pvec_sum(Real* v, Real* g, int dim, int size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int start = size * i; - if (start >= dim) return; int end = start + size; if (end > dim) end = dim; __shared__ Real row_data[CU1DBLOCK]; @@ -1261,6 +1260,23 @@ static void _apply_heaviside(Real* mat, MatrixDim d) { } +// Caution, here i/block{idx,dim}.x is the row index and j/block{idx,dim}.y is the col index. +// this is for no reason, really, I just happened to prefer this +// at the time. [dan] +template +__global__ +static void _apply_signum(Real* mat, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i * d.stride + j; + + if (i < d.rows && j < d.cols) { + if (mat[index] > 0.0) mat[index] = 1.0; + else if (mat[index] < 0.0) mat[index] = -1.0; + } +} + + template __global__ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { @@ -2145,7 +2161,10 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_heaviside<<>>(mat, d); +} +void cudaF_apply_signum(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { + _apply_signum<<>>(mat, d); } void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -2610,6 +2629,10 @@ void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _apply_heaviside<<>>(mat, d); } +void cudaD_apply_signum(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { + _apply_signum<<>>(mat, d); +} + void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim 
dst_dim, int src_stride) { _copy_cols<<>>(dst, src, reorder, dst_dim, src_stride); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fc1fbae54da..57133092574 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -125,6 +125,7 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_ap inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); } inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim dim) { cudaF_apply_pow_abs(Gr,Bl,mat,power,include_sign, dim); } inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_heaviside(Gr,Bl,mat,dim); } +inline void cuda_apply_signum(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_signum(Gr,Bl,mat,dim); } inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim dim) { cudaF_apply_floor(Gr,Bl,mat,floor_val,dim); } inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim dim) { cudaF_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); } inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -311,6 +312,7 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_a inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); } inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim dim) { cudaD_apply_pow_abs(Gr,Bl,mat,power,include_sign,dim); } inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_heaviside(Gr,Bl,mat,dim); } +inline void cuda_apply_signum(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_signum(Gr,Bl,mat,dim); } inline void cuda_apply_floor(dim3 Gr, 
dim3 Bl, double* mat, double floor_val, MatrixDim dim) { cudaD_apply_floor(Gr,Bl,mat,floor_val,dim); } inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim dim) { cudaD_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); } inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index f50ded8c209..1c32de34d5c 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -298,6 +298,23 @@ template void TestCuMatrixSigmoid(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixHeaviside(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + N.ApplyHeaviside(); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::Heaviside" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMulRowsGroupMat(int32 dim) { BaseFloat time_in_secs = 0.025; @@ -806,6 +823,8 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCholesky(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSigmoid(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixHeaviside(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuFindRowMaxId(sizes[s]); for (int32 s = 0; s < ns; s++) diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index eb5a268d543..a7c034a29ae 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1895,6 +1895,7 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); 
cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); + CU_SAFE_CALL(cudaGetLastError()); } else { KALDI_ERR << "Wrong sized arguments"; } @@ -2016,6 +2017,24 @@ void CuMatrixBase::ApplyHeaviside() { } } +template +void CuMatrixBase::ApplySignum() { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); + dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), + n_blocks(NumCols(), CU2DBLOCK)); + + cuda_apply_signum(dimGrid, dimBlock, data_, Dim()); /* signum kernel: +1 for x > 0, -1 for x < 0, zeros left unchanged; grid x indexes rows, y indexes columns */ + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Mat().ApplySignum(); + } +} template void CuMatrixBase::ApplyExp() { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index fd4c642ab7f..6ae233b8f56 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -337,6 +337,7 @@ class CuMatrixBase { ///< multiply the result by the sign of the input. void ApplyPowAbs(Real power, bool include_sign=false); void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ?
1.0 : 0.0) + void ApplySignum(); ///< For each element, sets x = (1 if x > 0; 0 if x = 0; -1 if x < 0) void ApplyFloor(Real floor_val); void ApplyCeiling(Real ceiling_val); void ApplyExp(); diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index a32e136f62e..9b7aa97776a 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -22,7 +22,7 @@ #include #include #include - +#include #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudamatrix/cu-matrix.h" @@ -62,7 +62,7 @@ static void UnitTestCuVectorIO() { } -template +template static void UnitTestCuVectorCopyFromVec() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -80,7 +80,7 @@ static void UnitTestCuVectorCopyFromVec() { } } -template +template static void UnitTestCuSubVector() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -97,7 +97,7 @@ static void UnitTestCuSubVector() { -template +template static void UnitTestCuVectorMulTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -105,7 +105,7 @@ static void UnitTestCuVectorMulTp() { A.SetRandn(); TpMatrix B(dim); B.SetRandn(); - + CuVector C(A); CuTpMatrix D(B); @@ -127,10 +127,10 @@ static void UnitTestCuVectorAddTp() { B.SetRandn(); Vector C(dim); C.SetRandn(); - + CuVector D(A); CuTpMatrix E(B); - CuVector F(C); + CuVector F(C); A.AddTpVec(1.0, B, kNoTrans, C, 1.0); D.AddTpVec(1.0, E, kNoTrans, F, 1.0); @@ -160,7 +160,7 @@ template void CuVectorUnitTestAddVec() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -177,7 +177,7 @@ template void CuVectorUnitTestAddVecCross() { CuVector vec1_orig(vec1); Real alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } else 
{ @@ -198,7 +198,7 @@ template void CuVectorUnitTestAddVecExtra() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243, beta = 1.4321; vec1.AddVec(alpha, vec2, beta); - + for (int32 i = 0; i < M; i++) AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -268,6 +268,20 @@ template static void UnitTestCuVectorReplaceValue() { } } +template static void UnitTestCuVectorSum() { + for (int32 i = 0; i < 200; i++) { + int32 start_dim = RandInt(1, 500), end_dim = RandInt(1, 500); + int32 dim = RandInt(10, 12000) + start_dim + end_dim; + Real quiet_nan = nan(""); // this is from . + Vector vec(start_dim + dim + end_dim); + vec.Range(0, start_dim).Set(quiet_nan); + vec.Range(start_dim, dim).Set(1.0); + vec.Range(start_dim + dim, end_dim).Set(quiet_nan); + BaseFloat sum = vec.Range(start_dim, dim).Sum(); + KALDI_ASSERT(ApproxEqual(sum, dim)); + } +} + template void CuVectorUnitTestInvertElements() { // Also tests MulElements(); int32 M = 256 + Rand() % 100; @@ -288,7 +302,7 @@ template void CuVectorUnitTestSum() { CuVector A(dim), ones(dim); A.SetRandn(); ones.Set(1.0); - + AssertEqual(VecVec(A, ones), A.Sum()); } } @@ -320,7 +334,7 @@ template void CuVectorUnitTestCopyFromMat() { } Matrix matrix(cu_matrix), matrix2(M, N); CuMatrix matrix3(M, N); - + CuVector vector(M * N), vector2(M * N); vector.CopyRowsFromMat(cu_matrix); vector2.CopyRowsFromMat(matrix); @@ -328,8 +342,8 @@ template void CuVectorUnitTestCopyFromMat() { matrix3.CopyRowsFromVec(Vector(vector2)); Vector vector3(M * N); vector3.CopyRowsFromMat(cu_matrix); - - + + for(int32 j = 0; j < M*N; j++) { if (Rand() % 500 == 0) { // random small subset (it was slow) KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N)); @@ -412,7 +426,7 @@ template void CuVectorUnitTestNorm() { KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0)); KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0))); } - + template void CuVectorUnitTestMin() { for (int32 p = 0; p < 5; p++) { @@ -496,7 +510,7 @@ template void 
CuVectorUnitTestApplyFloor() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyFloor(floor); int32 j = vector.ApplyFloor(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -517,7 +531,7 @@ template void CuVectorUnitTestApplyCeiling() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyCeiling(floor); int32 j = vector.ApplyCeiling(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -540,7 +554,7 @@ template void CuVectorUnitTestApplyPow() { BaseFloat pow = -2 + (Rand() % 5); cu_vector.ApplyPow(pow); vector.ApplyPow(pow); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -579,7 +593,7 @@ template void CuVectorUnitTestAddDiagMat2() { cu_mat_orig.SetRandn(); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); CuMatrix cu_mat(cu_mat_orig, trans); - + Vector vector(cu_vector); Matrix mat(cu_mat); @@ -604,12 +618,12 @@ static void CuVectorUnitTestAddDiagMatMat() { MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans); MatrixTransposeType transN = ((iter/2) % 2 == 0 ? 
kNoTrans : kTrans); CuMatrix M(M_orig, transM), N(N_orig, transN); - + v.SetRandn(); CuVector w(v); w.AddDiagMatMat(alpha, M, transM, N, transN, beta); - + { CuVector w2(v); CuMatrix MN(dimM, dimM); @@ -669,7 +683,7 @@ template void CuVectorUnitTestAddSpVec() { CuSpMatrix mat_cu(M); mat_cu.SetRandn(); SpMatrix mat(mat_cu); - + BaseFloat alpha = 0.5 * (Rand() % 5), beta = 0.5 * (Rand() % 5); dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta); dst.AddSpVec(alpha, mat, src, beta); @@ -695,6 +709,7 @@ template void CuVectorUnitTest() { CuVectorUnitTestScale(); CuVectorUnitTestSum(); CuVectorUnitTestInvertElements(); + UnitTestCuVectorSum(); CuVectorUnitTestAddRowSumMat(); CuVectorUnitTestAddColSumMat(); UnitTestCuVectorReplaceValue(); @@ -708,8 +723,8 @@ template void CuVectorUnitTest() { CuVectorUnitTestCopyDiagFromPacked(); CuVectorUnitTestCopyDiagFromMat(); CuVectorUnitTestCopyCross(); - CuVectorUnitTestCopyCross2(); - CuVectorUnitTestNorm(); + CuVectorUnitTestCopyCross2(); + CuVectorUnitTestNorm(); CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); @@ -732,10 +747,10 @@ int main(int argc, char *argv[]) { const char *usage = "Usage: cu-vector-test [options]"; ParseOptions po(usage); - std::string use_gpu = "yes"; + std::string use_gpu = "yes"; po.Register("use-gpu", &use_gpu, "yes|no|optional"); po.Read(argc, argv); - + if (po.NumArgs() != 0) { po.PrintUsage(); exit(1); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 64f41720869..6deb3809d85 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -279,7 +279,6 @@ Real CuVectorBase::Sum() const { CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); return tmp.Sum(); } else { - if (dim_ == 0) return 0.0; CuVector tmp(1, kUndefined); int dimBlock(CU1DBLOCK); int dimGrid = 1; // only 1 block here. we have loops in each thread. 
diff --git a/src/doc/glossary.dox b/src/doc/glossary.dox index ba42ea12370..31fa62d3389 100644 --- a/src/doc/glossary.dox +++ b/src/doc/glossary.dox @@ -26,7 +26,7 @@ search function of your browser. For convenience the definition of each term's section is preceded and followed by a colon, so for instance, typing ctrl-f ":lattice:" would take you to the section for "lattice". - +
@@ -37,7 +37,7 @@ synonymous with a sequence of transition-ids. Most of the time an alignment is derived from aligning the reference transcript of an utterance, in which case it is called a forced alignment. lattices also contain alignment information as sequences of transition-ids for each word -sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows +sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows alignments in a human-readable format. :forced alignment: see alignment. @@ -54,6 +54,18 @@ of the HMMs, and also various other important integer mappings; see \ref transit This object is generally written at the start of model files. The program \ref bin/show-transitions.cc "show-transitions" shows these. +:G.fst: The grammar FST G.fst which lives in the + data/lang/ directory in the scripts (see \ref data_prep_lang) represents + the language model in a Finite State Transducer format (see www.openfst.org). + For the most part it is an acceptor, meaning the input and output symbols on the + arcs are the same, but for statistical language models with backoff, the backoff + arcs have the "disambiguation symbol" #0 on the input side only. + For many purposes you'll want to get rid of the disambiguation symbols + using the command fstproject --project_output=true. The disambiguation symbols + are needed during graph compilation to make the FST determinizable, but for things + like language-model rescoring you don't want them. + +
*/ diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/doc/install.dox b/src/doc/install.dox index 0ffb2b1220f..b40b139a8dc 100644 --- a/src/doc/install.dox +++ b/src/doc/install.dox @@ -29,8 +29,8 @@ possibly including unfinished and experimental features, can be downloaded by typing into a shell: \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden - cd kaldi-trunk + git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream + cd kaldi \endverbatim If you want to get updates and bug fixes you can go to some checked-out directory, and type diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox index ee2bc11d8b9..df9f96e8430 100644 --- a/src/doc/tree_externals.dox +++ b/src/doc/tree_externals.dox @@ -32,13 +32,13 @@ namespace kaldi { The basic algorithm that is being implemented is a top-down greedy splitting, where we have a number of ways we can split the data by asking about, say, the left phone, the right - phone, the central phone, the 
state we're in, and so on. + phone, the central phone, the state we're in, and so on. The algorithm we implement is similar to the standard algorithm, see for example the paper "Tree-based State Tying for High Accuracy Acoustic Modeling" by Young, Odell and Woodland. In this algorithm, we split the data up by asking the locally optimal question, i.e. the one that gives the most likelihood increase, supposing - we model the data on each side of the split by a single Gaussian. - Differences from standard implementations include added flexibility + we model the data on each side of the split by a single Gaussian. + Differences from standard implementations include added flexibility about how to configure the tree roots; the ability to ask questions about the HMM-state and the central phone; and the fact that by default in the Kaldi scripts, the questions are automatically generated by a top-down binary clustering of the data, which means @@ -50,7 +50,7 @@ namespace kaldi { be the tree roots. For how to configure it using the standard scripts, see \ref data_prep. In practice we generally let each tree-root correspond to a "real phone", meaning that we group together all word-position-dependent, tone-dependent or stress-dependent versions of - each phone into one group that becomes a tree root. + each phone into one group that becomes a tree root. The rest of this page mostly gives details at the code level of what is happening. @@ -74,7 +74,7 @@ below summarizes these values: N is the width of the context window and P is the identity of the designated -"central phone". Normally P is exactly in the middle of the window +"central phone". Normally P is exactly in the middle of the window (hence the name "central-position"); for example, with N=3, we would normally have P=1, but you are free to choose any value from 0 to N-1; for instance, P=2 and N=3 means two phones of left context and no right context at all. 
@@ -82,32 +82,32 @@ In the code, when we talk about the "central phone" we always mean the P'th phone which may or may not actually be the central phone of the context window. A vector of integers representing a typical triphone context window might be: -\code -// probably not valid C++ +\code +// probably not valid C++ vector ctx_window = { 12, 15, 21 }; \endcode -Assuming N=3 and P=1, this would represent phone 15 with +Assuming N=3 and P=1, this would represent phone 15 with a right context of 21 and a left context of 12. The way we handle end effects is using zero (which is not a valid phone because it's reserved in OpenFst for the epsilon meaning "no symbol"), so for instance: -\code +\code vector ctx_window = { 12, 15, 0 }; \endcode means phone 15 with a left-context of 12 and no right-context because it's the end of the utterance. At the end of utterance in particular, the use of zero this way may be a little unexpected because the last "phone" is actually the -subsequential symbol "$" (see \ref graph_c), but for the convenience +subsequential symbol "$" (see \ref graph_c), but for the convenience of the decision-tree code we don't put the subsequential symbol in these context windows, we put zero. Note that if we had N=3 and P=2, the above context window would be invalid because its P'th element would be zero which is not a real phone; also of course, -if we had a tree with N=1, neither of the windows above would be valid because they +if we had a tree with N=1, neither of the windows above would be valid because they are the wrong size. In the monophone case, we would have a window like: -\code +\code vector ctx_window = { 15 }; \endcode so monophone systems are just treated as a special case of context-dependent -systems, with a window size N of 1 and a tree that doesn't do anything very +systems, with a window size N of 1 and a tree that doesn't do anything very interesting. @@ -126,28 +126,28 @@ TransitionModel object and an AmDiagGmm object). 
If the program gmm-init-mono receives an option called --shared-phones, it will share the pdfs between specified sets of phones; otherwise it makes all the phones separate. -After training a monophone system starting from a flat start, we take +After training a monophone system starting from a flat start, we take the monophone alignments -and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc +and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc "acc-tree-stats") to accumulate statistics for training the tree. This program is not limited to reading in monophone alignments; it works from context-dependent alignments too so we can build trees based on e.g. triphone alignments. -The statistics for tree building are written to disk as the type \ref BuildTreeStatsType -(see \ref treei_stats). +The statistics for tree building are written to disk as the type \ref BuildTreeStatsType +(see \ref treei_stats). The function AccumulateTreeStats() takes the values N and P, as explained in the previous section; the command-line programs will set these by default to 3 and 1 respectively, but this can be overridden using the --context-width -and --central-position options. The program \ref acc-tree-stats.cc +and --central-position options. The program \ref acc-tree-stats.cc "acc-tree-stats" takes a list of context-independent phones (e.g. silence), but this is not required even if there are context-independent phones; it is just -a mechanism to reduce the size of the statistics. +a mechanism to reduce the size of the statistics. For context-independent hones, the program will accumulate the corresponding statistics without the keys corresponding to the left and right phones defined (c.f. \ref treei_event_map). When the statistics have been -accumulated we use the program \ref build-tree.cc "build-tree" to -build the tree. This outputs the tree. +accumulated we use the program \ref build-tree.cc "build-tree" to +build the tree. 
This outputs the tree. The program \ref build-tree.cc "build-tree" requires three things: - The statistics (of type BuildTreeStatsType) - The questions config (of type Questions) @@ -160,21 +160,32 @@ scripts, these are automatically obtained from tree-building statistics by the program cluster-phones. The roots file specifies sets of phones that are goint to have shared roots in the decision-tree clustering process, and says for each phone set the following two things: - - "shared" or "not-shared" says whether or not there should be separate - roots for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, - in the typical case), or if the roots - should be shared. If we are going to be splitting (the "split" option - below) we enforce that the roots should be shared. + + - "shared" or "not-shared" says whether or not there should be separate roots + for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, in the + typical case), or if the roots should be shared. If it says "shared" there + will be a single tree-root for all HMM states (e.g. all three states, in a + normal topology) ; if "not-shared" there would be (e.g.) three tree-roots, + one for each pdf-class. + - "split" or "not-split" says whether or not the decision tree splitting should actually be done for the roots in question (for silence, we - typically don't split). + typically don't split). If the line says "split" (the normal case) then + we do the decision tree splitting. If it says "not-split" then no splitting + is done and the roots are left un-split. -Be careful because the notation is a bit tricky. The "shared" on the line of -the roots file is about whether we will share all the 3 HMM-states of the phone -in a single tree root. But we will always share together the roots of all the phones that -appear on a single lines of the roots file. This is not configurable via these -strings because if you don't want to share them, you can just put them on -separate lines of the roots file. 
+The following will clarify some aspects of how this works: + + - If we say "shared split", then + even though there is one root node for all three HMM-states, the different + HMM states can still get different leaves because the tree can ask questions + about the pdf-class as well as about phonetic context. + + - We always share together the roots of all the phones that appear on a single + line of the roots file. This is not configurable via these strings because + if you don't want to share the phones' roots, you can just put them on + separate lines of the roots file. Below is an example of a roots file; this assumes that phone 1 is silence and all the other phones have separate roots. @@ -185,14 +196,14 @@ shared split 3 ... shared split 28 \endverbatim -Having multiple phones on the same line is most useful when we have things like position and +Having multiple phones on the same line is most useful when we have things like position and stress-dependent phones; in this case each "real" phone would correspond to a set of integer phone ids. In that case we share the roots for all versions of a particular underlying phone.
Below is an example of a roots file -for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; +for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; it would have to be converted to integer form before being read by Kalid): \verbatim -not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S +not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S shared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_S shared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_S shared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_S @@ -207,7 +218,7 @@ When creating the roots file, you should ensure that at least one phone on each For instance, in this case, if the phone AY was seen in at least some combination of stress and word-position, we would be OK. -In this example, we have various word-position-dependent variants of silence and so on. +In this example, we have various word-position-dependent variants of silence and so on. In this example they will all share their pdf's because they are on the same line and are "not-split"-- but they may have different transition parameters. In fact, most of these variants of silence would never be used as silence never appears inside words; this is for @@ -224,13 +235,13 @@ tree to another using the program \ref convert-ali.cc "convert-ali". pdf-id, and these are contiguous (typically there are several thousand of these in an LVCSR system). They are originally assigned when the tree is first built. Depending how the tree is built, it may or may not be possible to say, for each pdf-id, which phone - it corresponds to. + it corresponds to. 
\section tree_ctxdep Context dependency objects The ContextDependencyInterface object is a virtual base-class for the tree that specifies how it interacts with the graph-building code. This - interface contains only four functions: + interface contains only four functions: - \ref ContextDependencyInterface::ContextWidth() "ContextWidth()" returns the value of N (context-width) that the tree requires. - \ref ContextDependencyInterface::CentralPosition() "CentralPosition()" returns @@ -264,8 +275,8 @@ else \endcode The only class that currently inherits from ContextDependencyInterface -is the class ContextDependency, which has marginally richer interface; -the only important addition is the function \ref ContextDependency::GetPdfInfo +is the class ContextDependency, which has marginally richer interface; +the only important addition is the function \ref ContextDependency::GetPdfInfo "GetPdfInfo" which is used by the TransitionModel class to work out which phones a particular pdf can possibly correspond to (this function could be emulated given only the interface of ContextDependencyInterface, by @@ -274,7 +285,7 @@ enumerating all contexts). The ContextDependency object is actually a fairly thin wrapper for the EventMap object; see \ref tree_internals. We wanted to hide the actual implementation of the tree as much as possible to make it -easy to refactor the code later if needed. +easy to refactor the code later if needed. \section tree_example An example of a decision tree @@ -309,18 +320,18 @@ Below is a kind of quasi-BNF notation that explains the tree-file format. In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that splits on key 1, which is the central phone. In square brackets are a contiguous range of phone-ids. As it happens, these don't represent a question, but are just a way of -splitting on phones so we can get to the "real" decision trees which are per phone. 
+splitting on phones so we can get to the "real" decision trees which are per phone. The issue is that this tree was built with "shared roots", so there are various phone-ids, corresponding to different word-position-and-stress-marked versions of the same phone, that share the root. We can't use a TableEventMap (TE) at the top level of the tree, or we'd have to repeat each decision tree several times (since the EventMap is a pure -tree, not a general graph, it has no mechanism for pointers to be "shared"). -The next few instances of the "SE" label are also part of this "quasi-tree" which +tree, not a general graph, it has no mechanism for pointers to be "shared"). +The next few instances of the "SE" label are also part of this "quasi-tree" which is initially splitting on the central phone (as we go down this file we are going deeper into the tree; notice that the braces "{" are opening but not yet closing). Then we have the string "TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap -on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. +on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. The values represent the five pdf-ids for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these three non-speech phones (only the transition matrix is specific to each non-speech phone). @@ -332,8 +343,8 @@ various versions of the phone AA; and question is asking whether the pdf-class ( has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the next question is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various forms of the phone "M" (a rather unintuitive question to ask, since we're -in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 
286 287 ]" which is -a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if +in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is +a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if no, 696 ("CE 696"). \verbatim s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100 @@ -366,8 +377,8 @@ SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 4 \endverbatim Below is a simpler example: the monophone tree from the Resource Management -recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). -The key "0" is the phone-position of zero which represents the central (and only) phone +recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). +The key "0" is the phone-position of zero which represents the central (and only) phone since the context width (N) is 1. The number of entries in the table is 49 (in this case, the number of phones plus one). The first EventMap in the table (index zero) is NULL, because there is no phone with @@ -375,11 +386,11 @@ index zero. The next one is a TableEventMap with three elements, corresponding to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )". \verbatim s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5 -ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) -TE -1 3 ( CE 3 CE 4 CE 5 ) -TE -1 3 ( CE 6 CE 7 CE 8 ) -TE -1 3 ( CE 9 CE 10 CE 11 ) -TE -1 3 ( CE 12 CE 13 CE 14 ) +ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) +TE -1 3 ( CE 3 CE 4 CE 5 ) +TE -1 3 ( CE 6 CE 7 CE 8 ) +TE -1 3 ( CE 9 CE 10 CE 11 ) +TE -1 3 ( CE 12 CE 13 CE 14 ) \endverbatim @@ -391,8 +402,8 @@ disambiguation symbols and possibly epsilon symbols). In the graph, as always, these are represented by integer labels. We use an object that, in code and in filenames, is generally called ilabel_info. 
The ilabel_info object 4has a strong connection to the \ref fst::ContextFst "ContextFst" objects, see \ref graph_context. -As with many other Kaldi types, ilabel_info is a generic (STL) type but -we use a consistent variable name +As with many other Kaldi types, ilabel_info is a generic (STL) type but +we use a consistent variable name to make it identifiable. It is of the following type: \code std::vector > ilabel_info; @@ -402,7 +413,7 @@ input label the corresponding phonetic context window (see above, \ref tree_window). For example, suppose symbol 1500 is phone 30 with a right-context of 12 and a left-context of 4, we would have -\code +\code // not valid C++ ilabel_info[1500] == { 4, 30, 12 }; \endcode @@ -410,14 +421,14 @@ In the monophone case, we would have things like: \code ilabel_info[30] == { 28 }; \endcode -There are special cases to deal with disambiguation symbols (see -\ref graph_disambig or the +There are special cases to deal with disambiguation symbols (see +\ref graph_disambig or the Springer Handbook paper referenced above for an explanation of what these are). If an ilabel_info entry corresponds to a disambiguation symbol, we put in it the negative of the symbol-table entry of the disambiguation symbol (note that this is not the same as the number of the printed form -of the disambiguation symbol as in #0, #1, #2 etc., it is the number -corresponding to it in a symbol-table file, which in our current scripts is +of the disambiguation symbol as in #0, #1, #2 etc., it is the number +corresponding to it in a symbol-table file, which in our current scripts is called phones_disambig.txt). For example, \code ilabel_info[5] == { -42 }; @@ -428,7 +439,7 @@ so the programs that interpret the ilabel_info object don't need to be given a list of disambiguation symbols in order to be able to distinguish them from real phones in the monophone case. 
There are two additional special cases: we have -\code +\code ilabel_info[0] == { }; // epsilon ilabel_info[1] == { 0 }; // disambig symbol #-1; // we use symbol 1, but don't consider this hardwired. diff --git a/src/feat/signal.cc b/src/feat/signal.cc index 19b876989c2..e8fbb0b84cf 100644 --- a/src/feat/signal.cc +++ b/src/feat/signal.cc @@ -34,7 +34,7 @@ void ElementwiseProductOfFft(const Vector &a, Vector *b) { void ConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); - Vector signal_padded(signal_length + filter_length - 1); + Vector signal_padded(signal_length + filter_length - 1); signal_padded.SetZero(); for (int32 i = 0; i < signal_length; i++) { for (int32 j = 0; j < filter_length; j++) { @@ -54,11 +54,11 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector SplitRadixRealFft srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector signal_padded(fft_length); + Vector signal_padded(fft_length); signal_padded.Range(0, signal_length).CopyFromVec(*signal); srfft.Compute(signal_padded.Data(), true); @@ -83,13 +83,13 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, Vector srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector temp_pad(filter_length - 1); + Vector temp_pad(filter_length - 1); temp_pad.SetZero(); - Vector signal_block_padded(fft_length); + Vector signal_block_padded(fft_length); for (int32 po = 0; po < signal_length; po += block_length) { // get a block of the signal diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 9843e7bbd4b..a2ad0032815 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -15,7 +15,7 @@ BINFILES = compute-mfcc-feats 
compute-plp-feats compute-fbank-feats \ process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \ compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \ wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \ - concat-feats + concat-feats vector-to-feat extract-column OBJFILES = diff --git a/src/featbin/extract-column.cc b/src/featbin/extract-column.cc new file mode 100644 index 00000000000..2bbf6b17235 --- /dev/null +++ b/src/featbin/extract-column.cc @@ -0,0 +1,82 @@ +// featbin/extract-column.cc + +// Copyright 2015 Vimal Manohar (Johns Hopkins University) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace std; + + const char *usage = + "Extract a column out of a matrix. \n" + "This is most useful to extract log-energies \n" + "from feature files\n" + "\n" + "Usage: extract-column [options] --column-index= \n" + " e.g. 
extract-column ark:feats-in.ark ark:energies.ark\n" + "See also: select-feats, subset-feats, subsample-feats, extract-rows\n"; + + ParseOptions po(usage); + + int32 column_index = 0; + + po.Register("column-index", &column_index, + "Index of column to extract"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + string feat_rspecifier = po.GetArg(1); + string vector_wspecifier = po.GetArg(2); + + SequentialBaseFloatMatrixReader reader(feat_rspecifier); + BaseFloatVectorWriter writer(vector_wspecifier); + + int32 num_done = 0, num_err = 0; + + string line; + + for (; !reader.Done(); reader.Next(), num_done++) { + const Matrix& feats(reader.Value()); + Vector col(feats.NumRows()); + if (column_index >= feats.NumCols()) { + KALDI_ERR << "Column index " << column_index << " is " + << "not less than number of columns " << feats.NumCols(); + } + col.CopyColFromMat(feats, column_index); + writer.Write(reader.Key(), col); + } + + KALDI_LOG << "Processed " << num_done << " segments successfully; " + << "errors on " << num_err; + + return (num_done > 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/featbin/vector-to-feat.cc b/src/featbin/vector-to-feat.cc new file mode 100644 index 00000000000..5e98cf95a1c --- /dev/null +++ b/src/featbin/vector-to-feat.cc @@ -0,0 +1,99 @@ +// featbin/vector-to-feat.cc + +// Copyright 2015 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Convert a vector into a single feature so that it can be appended \n" + "to other feature matrices\n" + "Usage: vector-to-feats \n" + "or: vector-to-feats \n" + "e.g.: vector-to-feats scp:weights.scp ark:weight_feats.ark\n" + " or: vector-to-feats weight_vec feat_mat\n" + "See also: copy-feats, copy-matrix, paste-feats, \n" + "subsample-feats, splice-feats\n"; + + ParseOptions po(usage); + bool compress = false, binary = true; + + po.Register("binary", &binary, "Binary-mode output (not relevant if writing " + "to archive)"); + po.Register("compress", &compress, "If true, write output in compressed form" + "(only currently supported for wxfilename, i.e. 
archive/script," + "output)"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + int32 num_done = 0; + + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + std::string vector_rspecifier = po.GetArg(1); + std::string feature_wspecifier = po.GetArg(2); + + SequentialBaseFloatVectorReader vector_reader(vector_rspecifier); + BaseFloatMatrixWriter feat_writer(feature_wspecifier); + CompressedMatrixWriter compressed_feat_writer(feature_wspecifier); + + for (; !vector_reader.Done(); vector_reader.Next(), ++num_done) { + const Vector &vec = vector_reader.Value(); + Matrix feat(vec.Dim(), 1); + feat.CopyColFromVec(vec, 0); + + if (!compress) + feat_writer.Write(vector_reader.Key(), feat); + else + compressed_feat_writer.Write(vector_reader.Key(), CompressedMatrix(feat)); + } + KALDI_LOG << "Converted " << num_done << " vectors into features"; + return (num_done != 0 ? 0 : 1); + } + + KALDI_ASSERT(!compress && "Compression not yet supported for single files"); + + std::string vector_rxfilename = po.GetArg(1), + feature_wxfilename = po.GetArg(2); + + Vector vec; + ReadKaldiObject(vector_rxfilename, &vec); + + Matrix feat(vec.Dim(), 1); + feat.CopyColFromVec(vec, 0); + + WriteKaldiObject(feat, feature_wxfilename, binary); + + KALDI_LOG << "Converted vector " << PrintableRxfilename(vector_rxfilename) + << " to " << PrintableWxfilename(feature_wxfilename); + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 25acf48a7d1..4e5cbd45282 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -429,18 +429,6 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model, } } -// comparator object that can be used to sort from greatest to -// least posterior. 
-struct CompareReverseSecond { - // view this as an "<" operator used for sorting, except it behaves like - // a ">" operator on the .second field of the pair because we want the - // sort to be in reverse order (greatest to least) on posterior. - bool operator() (const std::pair &a, - const std::pair &b) { - return (a.second > b.second); - } -}; - BaseFloat VectorToPosteriorEntry( const VectorBase &log_likes, int32 num_gselect, diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index 18bbd65a86a..4f5896da7c6 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -155,6 +155,18 @@ int32 MergePosteriors(const Posterior &post1, bool drop_frames, Posterior *post); +// comparator object that can be used to sort from greatest to +// least posterior. +struct CompareReverseSecond { + // view this as an "<" operator used for sorting, except it behaves like + // a ">" operator on the .second field of the pair because we want the + // sort to be in reverse order (greatest to least) on posterior. 
+ bool operator() (const std::pair &a, + const std::pair &b) { + return (a.second > b.second); + } +}; + /// Given a vector of log-likelihoods (typically of Gaussians in a GMM /// but could be of pdf-ids), a number gselect >= 1 and a minimum posterior /// 0 <= min_post < 1, it gets the posterior for each element of log-likes diff --git a/src/lat/Makefile b/src/lat/Makefile index ef9166fea12..bb36694f12e 100644 --- a/src/lat/Makefile +++ b/src/lat/Makefile @@ -6,7 +6,8 @@ include ../kaldi.mk EXTRA_CXXFLAGS += -Wno-sign-compare TESTFILES = kaldi-lattice-test push-lattice-test minimize-lattice-test \ - determinize-lattice-pruned-test word-align-lattice-lexicon-test + determinize-lattice-pruned-test word-align-lattice-lexicon-test \ + lattice-functions-test OBJFILES = kaldi-lattice.o lattice-functions.o word-align-lattice.o \ phone-align-lattice.o word-align-lattice-lexicon.o sausages.o \ diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index 0ea66712eda..fcb0039a6a3 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -6,6 +6,7 @@ // 2013 Cisco Systems (author: Neha Agrawal) [code modified // from original code in ../gmmbin/gmm-rescore-lattice.cc] // 2014 Guoguo Chen +// 2014-2015 Vimal Manohar // See ../../COPYING for clarification regarding multiple authors // @@ -22,17 +23,22 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
- +#include #include "lat/lattice-functions.h" #include "hmm/transition-model.h" #include "util/stl-utils.h" #include "base/kaldi-math.h" #include "hmm/hmm-utils.h" +#include "hmm/posterior.h" +#include "base/kaldi-types-extra.h" namespace kaldi { using std::map; using std::vector; +typedef SignedLogReal SignedLogDouble; +typedef SignedLogReal SignedLogBaseFloat; + int32 LatticeStateTimes(const Lattice &lat, vector *times) { if (!lat.Properties(fst::kTopSorted, true)) KALDI_ERR << "Input lattice must be topologically sorted."; @@ -270,7 +276,9 @@ template bool PruneLattice(BaseFloat beam, CompactLattice *lat); BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post, - double *acoustic_like_sum) { + double *acoustic_like_sum, + std::vector *out_alpha, + std::vector *out_beta) { // Note, Posterior is defined as follows: Indexed [frame], then a list // of (transition-id, posterior-probability) pairs. // typedef std::vector > > Posterior; @@ -289,22 +297,35 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post, int32 num_states = lat.NumStates(); vector state_times; int32 max_time = LatticeStateTimes(lat, &state_times); - std::vector alpha(num_states, kLogZeroDouble); - std::vector &beta(alpha); // we re-use the same memory for - // this, but it's semantically distinct so we name it differently. + + std::vector *alpha, *beta; + if (out_alpha && out_beta) { + alpha = out_alpha; + beta = out_beta; + alpha->clear(); + alpha->resize(num_states, kLogZeroDouble); + beta->clear(); + beta->resize(num_states, kLogZeroDouble); + } else { + alpha = new std::vector(num_states, kLogZeroDouble); + beta = alpha; + // we re-use the same memory for + // this, but it's semantically distinct so we name it differently. + } + double tot_forward_prob = kLogZeroDouble; post->clear(); post->resize(max_time); - alpha[0] = 0.0; + (*alpha)[0] = 0.0; // Propagate alphas forward. 
for (StateId s = 0; s < num_states; s++) { - double this_alpha = alpha[s]; + double this_alpha = (*alpha)[s]; for (ArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); double arc_like = -ConvertToCost(arc.weight); - alpha[arc.nextstate] = LogAdd(alpha[arc.nextstate], this_alpha + arc_like); + (*alpha)[arc.nextstate] = LogAdd((*alpha)[arc.nextstate], this_alpha + arc_like); } Weight f = lat.Final(s); if (f != Weight::Zero()) { @@ -320,13 +341,13 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post, for (ArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); double arc_like = -ConvertToCost(arc.weight), - arc_beta = beta[arc.nextstate] + arc_like; + arc_beta = (*beta)[arc.nextstate] + arc_like; this_beta = LogAdd(this_beta, arc_beta); int32 transition_id = arc.ilabel; // The following "if" is an optimization to avoid un-needed exp(). if (transition_id != 0 || acoustic_like_sum != NULL) { - double posterior = Exp(alpha[s] + arc_beta - tot_forward_prob); + double posterior = Exp((*alpha)[s] + arc_beta - tot_forward_prob); if (transition_id != 0) // Arc has a transition-id on it [not epsilon] (*post)[state_times[s]].push_back(std::make_pair(transition_id, @@ -337,12 +358,12 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post, } if (acoustic_like_sum != NULL && f != Weight::Zero()) { double final_logprob = - ConvertToCost(f), - posterior = Exp(alpha[s] + final_logprob - tot_forward_prob); + posterior = Exp((*alpha)[s] + final_logprob - tot_forward_prob); *acoustic_like_sum -= posterior * f.Value2(); } - beta[s] = this_beta; + (*beta)[s] = this_beta; } - double tot_backward_prob = beta[0]; + double tot_backward_prob = (*beta)[0]; if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob << ", while total backward probability = " << tot_backward_prob; @@ -398,72 +419,6 @@ void 
ConvertLatticeToPhones(const TransitionModel &trans, } -static inline double LogAddOrMax(bool viterbi, double a, double b) { - if (viterbi) - return std::max(a, b); - else - return LogAdd(a, b); -} - -// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or -// best-path negated cost) Note: in either case, the alphas and betas are -// negated costs. Requires that lat be topologically sorted. This code -// will work for either CompactLattice or Latice. -template -static double ComputeLatticeAlphasAndBetas(const LatticeType &lat, - bool viterbi, - vector *alpha, - vector *beta) { - typedef typename LatticeType::Arc Arc; - typedef typename Arc::Weight Weight; - typedef typename Arc::StateId StateId; - - StateId num_states = lat.NumStates(); - KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted); - KALDI_ASSERT(lat.Start() == 0); - alpha->resize(num_states, kLogZeroDouble); - beta->resize(num_states, kLogZeroDouble); - - double tot_forward_prob = kLogZeroDouble; - (*alpha)[0] = 0.0; - // Propagate alphas forward. - for (StateId s = 0; s < num_states; s++) { - double this_alpha = (*alpha)[s]; - for (fst::ArcIterator aiter(lat, s); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - double arc_like = -ConvertToCost(arc.weight); - (*alpha)[arc.nextstate] = LogAddOrMax(viterbi, (*alpha)[arc.nextstate], - this_alpha + arc_like); - } - Weight f = lat.Final(s); - if (f != Weight::Zero()) { - double final_like = this_alpha - ConvertToCost(f); - tot_forward_prob = LogAddOrMax(viterbi, tot_forward_prob, final_like); - } - } - for (StateId s = num_states-1; s >= 0; s--) { // it's guaranteed signed. 
- double this_beta = -ConvertToCost(lat.Final(s)); - for (fst::ArcIterator aiter(lat, s); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - double arc_like = -ConvertToCost(arc.weight), - arc_beta = (*beta)[arc.nextstate] + arc_like; - this_beta = LogAddOrMax(viterbi, this_beta, arc_beta); - } - (*beta)[s] = this_beta; - } - double tot_backward_prob = (*beta)[lat.Start()]; - if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { - KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob - << ", while total backward probability = " << tot_backward_prob; - } - // Split the difference when returning... they should be the same. - return 0.5 * (tot_backward_prob + tot_forward_prob); -} - - - /// This is used in CompactLatticeLimitDepth. struct LatticeArcRecord { BaseFloat logprob; // logprob <= 0 is the best Viterbi logprob of this arc, @@ -736,7 +691,6 @@ bool LatticeBoost(const TransitionModel &trans, } - BaseFloat LatticeForwardBackwardMpeVariants( const TransitionModel &trans, const std::vector &silence_phones, @@ -831,19 +785,24 @@ BaseFloat LatticeForwardBackwardMpeVariants( if (!is_mpfe) { // smbr. int32 pdf = trans.TransitionIdToPdf(arc.ilabel), ref_pdf = trans.TransitionIdToPdf(num_ali[cur_time]); - if (!one_silence_class) // old behavior + if (!one_silence_class) { // old behavior + //frame_acc = (pdf == ref_pdf && !ref_phone_is_sil) ? 1.0 : 0.0; frame_acc = (pdf == ref_pdf && !phone_is_sil) ? 1.0 : 0.0; - else + } else frame_acc = (pdf == ref_pdf || both_sil) ? 1.0 : 0.0; } else { if (!one_silence_class) // old behavior - frame_acc = (phone == ref_phone && !phone_is_sil) ? 1.0 : 0.0; + frame_acc = (phone == ref_phone && !ref_phone_is_sil) ? 1.0 : 0.0; else frame_acc = (phone == ref_phone || both_sil) ? 
1.0 : 0.0; } } double arc_scale = Exp(alpha[s] + arc_like - alpha[arc.nextstate]); alpha_smbr[arc.nextstate] += arc_scale * (alpha_smbr[s] + frame_acc); + KALDI_VLOG(10) << "Alpha SMBR for state " << arc.nextstate + << " reached from state " << s + << " at time " << state_times[s] << " is " + << alpha_smbr[s]; } Weight f = lat.Final(s); if (f != Weight::Zero()) { @@ -875,13 +834,14 @@ BaseFloat LatticeForwardBackwardMpeVariants( if (!is_mpfe) { // smbr. int32 pdf = trans.TransitionIdToPdf(arc.ilabel), ref_pdf = trans.TransitionIdToPdf(num_ali[cur_time]); - if (!one_silence_class) // old behavior + if (!one_silence_class) { // old behavior + //frame_acc = (pdf == ref_pdf && !ref_phone_is_sil) ? 1.0 : 0.0; frame_acc = (pdf == ref_pdf && !phone_is_sil) ? 1.0 : 0.0; - else + } else frame_acc = (pdf == ref_pdf || both_sil) ? 1.0 : 0.0; } else { if (!one_silence_class) // old behavior - frame_acc = (phone == ref_phone && !phone_is_sil) ? 1.0 : 0.0; + frame_acc = (phone == ref_phone && !ref_phone_is_sil) ? 1.0 : 0.0; else frame_acc = (phone == ref_phone || both_sil) ? 
1.0 : 0.0; } @@ -892,8 +852,13 @@ BaseFloat LatticeForwardBackwardMpeVariants( // i.e., paths don't survive to the final state if (KALDI_ISNAN(arc_scale)) arc_scale = 0; beta_smbr[s] += arc_scale * (beta_smbr[arc.nextstate] + frame_acc); + KALDI_VLOG(10) << "Beta SMBR for state " << s + << " going to state " << arc.nextstate + << " at time " << state_times[s] << " is " + << beta_smbr[s]; if (transition_id != 0) { // Arc has a transition-id on it [not epsilon] + // Get gradient wrt acoustic log-likelihood double posterior = Exp(alpha[s] + arc_beta - tot_forward_prob); double acc_diff = alpha_smbr[s] + frame_acc + beta_smbr[arc.nextstate] - tot_forward_score; @@ -918,6 +883,491 @@ BaseFloat LatticeForwardBackwardMpeVariants( return tot_forward_score; } + + +BaseFloat LatticeForwardBackwardEmpeVariants( + const TransitionModel &trans, + const std::vector &silence_phones, + const Lattice &lat, + const std::vector &num_ali, + const Posterior *num_post, + const Lattice *num_lat, + std::string criterion, + bool one_silence_class, + BaseFloat deletion_penalty, + Posterior *post, + BaseFloat weight_threshold, + const std::vector *weights) { + using namespace fst; + typedef Lattice::Arc Arc; + typedef Arc::Weight Weight; + typedef Arc::StateId StateId; + + KALDI_ASSERT(criterion == "empfe" || criterion == "esmbr"); + + if (lat.Properties(fst::kTopSorted, true) == 0) + KALDI_ERR << "Input lattice must be topologically sorted."; + KALDI_ASSERT(lat.Start() == 0); + + vector state_times; + int32 max_time = LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(num_ali.size() == max_time); + std::vector alpha, beta; + + Posterior num_post_computed; + + if ((criterion == "smbr" || criterion == "mpfe") && num_lat == NULL && num_post == NULL) { + // Using numerator alignment + KALDI_VLOG(4) << "Computing for " << criterion + << " criterion using numerator alignment"; + AlignmentToPosterior(num_ali, &num_post_computed); + ComputeLatticeAlphasAndBetas(lat, false, &alpha, &beta); + } 
else if (num_lat) { + // Using numerator lattice + KALDI_VLOG(4) << "Computing for " << criterion + << " criterion using numerator lattice"; + LatticeForwardBackward(*num_lat, &num_post_computed, NULL); + ComputeLatticeAlphasAndBetas(lat, false, &alpha, &beta); + } else if (num_post) { + // Using numerator posteriors + KALDI_VLOG(4) << "Computing for " << criterion + << " criterion using numerator posteriors"; + num_post_computed = *num_post; + ComputeLatticeAlphasAndBetas(lat, false, &alpha, &beta); + } else { + // Using denominator lattice + KALDI_VLOG(4) << "Computing for " << criterion + << " criterion using denominator lattice"; + LatticeForwardBackward(lat, &num_post_computed, + NULL, &alpha, &beta); + } + + // Now combine any posteriors with the same transition-id. + for (int32 t = 0; t < max_time; t++) + MergePairVectorSumming(&(num_post_computed[t])); + + // Remove frames with max numerator posterior < weight_threshold + for (size_t i = 0; i < max_time; i++) { + std::vector > &post_i = num_post_computed[i]; + std::vector >::iterator it = + std::min_element(post_i.begin(), post_i.end(), CompareReverseSecond()); + if (it->second < weight_threshold) + num_post_computed[i].clear(); + } + + BaseFloat tot_forward_score = + LatticeForwardBackwardEmpeVariantsInternal(trans, silence_phones, lat, + num_ali, num_post_computed, alpha, beta, criterion, + one_silence_class, deletion_penalty, post, weights); + + return tot_forward_score; +} + +BaseFloat LatticeForwardBackwardEmpeVariantsInternal( + const TransitionModel &trans, + const std::vector &silence_phones, + const Lattice &lat, + const std::vector &num_ali, + const Posterior &num_post, + const std::vector &alpha, + const std::vector &beta, + std::string criterion, + bool one_silence_class, + BaseFloat deletion_penalty, + Posterior *post, + const std::vector *weights = NULL) { + using namespace fst; + typedef Lattice::Arc Arc; + typedef Arc::Weight Weight; + typedef Arc::StateId StateId; + + KALDI_ASSERT(criterion 
== "empfe" || criterion == "esmbr"); + bool is_mpfe = (criterion == "empfe"); + + if (lat.Properties(fst::kTopSorted, true) == 0) + KALDI_ERR << "Input lattice must be topologically sorted."; + KALDI_ASSERT(lat.Start() == 0); + + int32 num_states = lat.NumStates(); + vector state_times; + int32 max_time = LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(alpha.size() == num_states && beta.size() == num_states); + + std::vector alpha_smbr(num_states, 0), //forward variable for sMBR + beta_smbr(num_states, 0); //backward variable for sMBR + + post->clear(); + post->resize(max_time); + + double tot_forward_prob = beta[0]; + double tot_forward_score = 0; + + alpha_smbr[0] = 0.0; + // Second Pass Forward, calculate forward for EMPFE/ESMBR + for (StateId s = 0; s < num_states; s++) { + double this_alpha = alpha[s]; + for (ArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + double arc_like = -ConvertToCost(arc.weight); + double frame_acc = 0.0; + if (arc.ilabel != 0) { + int32 cur_time = state_times[s]; + int32 phone = trans.TransitionIdToPhone(arc.ilabel); + int32 pdf = trans.TransitionIdToPdf(arc.ilabel); + bool phone_is_sil = std::binary_search(silence_phones.begin(), + silence_phones.end(), + phone); + + // Go through the numerator lattice + for (std::vector >::const_iterator it = num_post[cur_time].begin(); + it != num_post[cur_time].end(); ++it) { + int32 ref_phone = trans.TransitionIdToPhone(it->first); + BaseFloat weight = it->second; + + bool ref_phone_is_sil = std::binary_search(silence_phones.begin(), + silence_phones.end(), + ref_phone), + both_sil = phone_is_sil && ref_phone_is_sil; + + if (!is_mpfe) { // smbr. + int32 ref_pdf = trans.TransitionIdToPdf(it->first); + if (!one_silence_class) // old behavior + //frame_acc += (pdf == ref_pdf && !phone_is_sil) ? weight : 0.0; + // fixed old behavior + frame_acc += (pdf == ref_pdf && !ref_phone_is_sil) ? 
weight : 0.0; + else + frame_acc += (pdf == ref_pdf || both_sil) ? weight : 0.0; + } else { + if (!one_silence_class) // old behavior + // frame_acc += (phone == ref_phone && !phone_is_sil) ? weight : 0.0; + // fixed old behavior + frame_acc += (phone == ref_phone && !ref_phone_is_sil) ? weight : 0.0; + else + frame_acc += (phone == ref_phone || both_sil) ? weight : 0.0; + } + } + + if (deletion_penalty > 0) { + int32 ali_phone = trans.TransitionIdToPhone(num_ali[cur_time]); + bool ali_is_sil = std::binary_search(silence_phones.begin(), + silence_phones.end(), + ali_phone); + // Add extra score to a path if it is not a deletion + // (deletion: path has silence and best path has non-silence) + frame_acc += !(!ali_is_sil && phone_is_sil) ? deletion_penalty : 0.0; + } + } + + if (weights != NULL) + frame_acc *= (*weights)[state_times[s]]; + + double arc_scale = Exp(alpha[s] + arc_like - alpha[arc.nextstate]); + alpha_smbr[arc.nextstate] += arc_scale * (alpha_smbr[s] + frame_acc); + KALDI_VLOG(10) << "Alpha SMBR for state " << arc.nextstate + << " reached from state " << s + << " at time " << state_times[s] << " is " + << alpha_smbr[s]; + } + Weight f = lat.Final(s); + if (f != Weight::Zero()) { + double final_like = this_alpha - (f.Value1() + f.Value2()); + double arc_scale = Exp(final_like - tot_forward_prob); + tot_forward_score += arc_scale * alpha_smbr[s]; + KALDI_ASSERT(state_times[s] == max_time && + "Lattice is inconsistent (final-prob not at max_time)"); + } + } + + // Second Pass Backward, collect EMPFE style posteriors + for (StateId s = num_states-1; s >= 0; s--) { + for (ArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + double arc_like = -ConvertToCost(arc.weight), + arc_beta = beta[arc.nextstate] + arc_like; + int32 transition_id = arc.ilabel; + double frame_acc = 0.0; + if (arc.ilabel != 0) { + int32 cur_time = state_times[s]; + int32 phone = trans.TransitionIdToPhone(arc.ilabel); + int32 pdf = 
trans.TransitionIdToPdf(arc.ilabel); + bool phone_is_sil = std::binary_search(silence_phones.begin(), + silence_phones.end(), phone); + for (std::vector >::const_iterator it = num_post[cur_time].begin(); + it != num_post[cur_time].end(); ++it) { + int32 ref_phone = trans.TransitionIdToPhone(it->first); + BaseFloat weight = it->second; + bool ref_phone_is_sil = std::binary_search(silence_phones.begin(), + silence_phones.end(), + ref_phone), + both_sil = phone_is_sil && ref_phone_is_sil; + if (!is_mpfe) { // smbr. + int32 ref_pdf = trans.TransitionIdToPdf(it->first); + if (!one_silence_class) // old behavior + // frame_acc += (pdf == ref_pdf && !phone_is_sil) ? weight : 0.0; + // fixed old behavior + frame_acc += (pdf == ref_pdf && !ref_phone_is_sil) ? weight : 0.0; + else + frame_acc += (pdf == ref_pdf || both_sil) ? weight : 0.0; + } else { + if (!one_silence_class) // old behavior + // frame_acc += (phone == ref_phone && !phone_is_sil) ? weight : 0.0; + // fixed old behavior + frame_acc += (phone == ref_phone && !ref_phone_is_sil) ? weight : 0.0; + else + frame_acc += (phone == ref_phone || both_sil) ? weight : 0.0; + } + } + + if (deletion_penalty > 0.0) { + int32 ali_phone = trans.TransitionIdToPhone(num_ali[cur_time]); + bool ali_is_sil = std::binary_search(silence_phones.begin(), + silence_phones.end(), + ali_phone); + // Add extra score to a path if it is not a deletion + // (deletion: path has silence and best path has non-silence) + frame_acc += !(!ali_is_sil && phone_is_sil) ? 
deletion_penalty : 0.0; + } + } + + if (weights != NULL) + frame_acc *= (*weights)[state_times[s]]; + + + double arc_scale = Exp(beta[arc.nextstate] + arc_like - beta[s]); + // check arc_scale NAN, + // this is to prevent partial paths in Lattices + // i.e., paths don't survive to the final state + if (KALDI_ISNAN(arc_scale)) arc_scale = 0; + beta_smbr[s] += arc_scale * (beta_smbr[arc.nextstate] + frame_acc); + KALDI_VLOG(10) << "Beta SMBR for state " << s + << " going to state " << arc.nextstate + << " at time " << state_times[s] << " is " + << beta_smbr[s]; + + if (transition_id != 0) { // Arc has a transition-id on it [not epsilon] + double posterior = exp(alpha[s] + arc_beta - tot_forward_prob); + double acc_diff = alpha_smbr[s] + frame_acc + beta_smbr[arc.nextstate] + - tot_forward_score; + double posterior_smbr = posterior * acc_diff; + (*post)[state_times[s]].push_back(std::make_pair(transition_id, + static_cast(posterior_smbr))); + } + } + } + + //Second Pass Forward Backward check + double tot_backward_score = beta_smbr[0]; // Initial state id == 0 + // may loose the condition somehow here 1e-5/1e-4 + if (!ApproxEqual(tot_forward_score, tot_backward_score, 1e-4)) { + KALDI_ERR << "Total forward score over lattice = " << tot_forward_score + << ", while total backward score = " << tot_backward_score; + } + + // Output the computed posteriors + for (int32 t = 0; t < max_time; t++) + MergePairVectorSumming(&((*post)[t])); + + return tot_forward_score; +} + + +SignedLogDouble LatticeForwardBackwardNce( + const TransitionModel &trans, + const Lattice &lat, + Posterior *post, + const std::vector *weights, + BaseFloat weight_threshold) { + using namespace fst; + typedef Lattice::Arc Arc; + typedef Arc::Weight Weight; + typedef Arc::StateId StateId; + + if (lat.Properties(fst::kTopSorted, true) == 0) + KALDI_ERR << "Input lattice must be topologically sorted."; + KALDI_ASSERT(lat.Start() == 0); + + int32 num_states = lat.NumStates(); + vector state_times; + int32 
max_time = LatticeStateTimes(lat, &state_times); + + std::vector alpha_p(num_states), // forward variable for p + alpha_r(num_states), // forward variable for -plog(p) + beta_p(num_states), // backward variable for p + beta_r(num_states); // backward variable for -plog(p) + + SignedLogDouble Z; + SignedLogDouble r; + + post->clear(); + post->resize(max_time); + + KALDI_ASSERT(lat.Start() == 0); // For debugging + + alpha_p[0].SetOne(); + int32 final_states_count = 0; + // Forward Pass + for (StateId s = 0; s < num_states; s++) { + SignedLogDouble this_alpha_p(alpha_p[s]); + SignedLogDouble this_alpha_r(alpha_r[s]); + + for (ArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + SignedLogDouble p_a(false, -ConvertToCost(arc.weight)); // Initialize from log of real number + + // r_a = (p_a * -log_p_a); + SignedLogDouble r_a(-p_a.LogMagnitude()); + r_a.Multiply(p_a); + + // alpha_p[n[a]] += this_alpha_p * p_a + alpha_p[arc.nextstate].Add(this_alpha_p * p_a); + + // alpha_r[n[a]] += this_alpha_p * r_a + this_alpha_r * p_a + alpha_r[arc.nextstate].Add(this_alpha_p * r_a); + alpha_r[arc.nextstate].Add(this_alpha_r * p_a); + } + Weight f = lat.Final(s); + if (f != Weight::Zero()) { + final_states_count++; + SignedLogDouble f_p(false, -(f.Value1() + f.Value2())); + + // f_r = f_p * -log_f_p + SignedLogDouble f_r(-f_p.LogMagnitude()); // Initialize from a real number + f_r.Multiply(f_p); + + Z.Add(this_alpha_p * f_p); + + r.Add(this_alpha_p * f_r); + r.Add(this_alpha_r * f_p); + + KALDI_ASSERT(state_times[s] == max_time && "Lattice is inconsistent (final-prob not at max_time"); + } + } + + // Special case check where the final state has weight One(). 
+ // This case is ensured by connecting all original final states to the "Final" + // state through arcs carrying their respective final weights and then + // add a "Final" weight of One() to the new state + // KALDI_ASSERT(final_states_count == 1); // Apparently not true + + // Backward Pass + for (StateId s = num_states-1; s >= 0; s--) { + Weight f = lat.Final(s); + SignedLogDouble this_beta_p; + SignedLogDouble this_beta_r; + + if (f != Weight::Zero()) { + KALDI_ASSERT(state_times[s] == max_time); // Special case + + SignedLogDouble f_p(false, -(f.Value1() + f.Value2())); // Initialize from log of real number + + // f_r = f_p * -log_f_p + SignedLogDouble f_r(-f_p.LogMagnitude()); // Initialize from real number + f_r.Multiply(f_p); + + this_beta_p.Add(f_p); + this_beta_r.Add(f_r); + } + + for (ArcIterator aiter(lat,s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + SignedLogDouble p_a(false, -ConvertToCost(arc.weight)); // Initialize from log of real number + + // log(p_a * -log_p_a); + SignedLogDouble r_a(-p_a.LogMagnitude()); + r_a.Multiply(p_a); + + this_beta_p.Add(beta_p[arc.nextstate] * p_a); + this_beta_r.Add(beta_p[arc.nextstate] * r_a); + this_beta_r.Add(beta_r[arc.nextstate] * p_a); + } + beta_p[s] = this_beta_p; + beta_r[s] = this_beta_r; + + KALDI_VLOG(10) << "beta_p for state " << s << " is " << beta_p[s]; + KALDI_VLOG(10) << "beta_r for state " << s << " is " << beta_r[s]; + + } + + // Forward-Backward Check + KALDI_VLOG(10) << "Total forward probability over lattice = " << Z + << ", while total backward probability = " << beta_p[0]; + KALDI_VLOG(10) << "Total forward (-plog(p)) over lattice = " << r + << ", while total backward (-plog(p)) = " << beta_r[0]; + if (!Z.ApproxEqual(beta_p[0], 1e-6)) { + KALDI_WARN << "Total forward probability over lattice = " << Z + << ", while total backward probability = " << beta_p[0]; + } + if (!r.ApproxEqual(beta_r[0], 1e-6)) { + KALDI_WARN << "Total forward (-plog(p)) over lattice = " << r + << 
", while total backward (-plog(p)) = " << beta_r[0]; + } + + // Compute Entropy H = r/Z + log(Z) + SignedLogDouble H(r); + H.DivideBy(Z); + + KALDI_ASSERT(Z.Positive()); + H.AddReal(Z.LogMagnitude()); + + KALDI_VLOG(4) << "Entropy of Lattice is " << H; + + // Derivative Computation + for (StateId s = 0; s < num_states; s++) { + int32 t = state_times[s]; + if (weights != NULL && (*weights)[t] < weight_threshold) + continue; + for (ArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + SignedLogDouble p_a(false, -ConvertToCost(arc.weight)); + + // log(p_a * -log_p_a); + SignedLogDouble r_a(-p_a.LogMagnitude()); + r_a.Multiply(p_a); + + if (arc.ilabel != 0) { + //SignedLogDouble delH((alpha_p[s] * beta_p[arc.nextstate] * p_a) / Z); + //delH.Sub((alpha_p[s] * beta_p[arc.nextstate] * p_a) / Z * r / Z); + //delH.Add((alpha_p[s] * beta_r[arc.nextstate] * p_a) / Z); + //delH.Add((alpha_r[s] * beta_p[arc.nextstate] * p_a) / Z); + //delH.Sub((alpha_p[s] * beta_p[arc.nextstate] * p_a) / Z); + //delH.Add((alpha_p[s] * beta_p[arc.nextstate] * r_a) / Z); + + SignedLogDouble delZ = alpha_p[s] * beta_p[arc.nextstate] * p_a; + SignedLogDouble delr = alpha_p[s] * beta_r[arc.nextstate] * p_a; + delr.Add(alpha_r[s] * beta_p[arc.nextstate] * p_a); + delr.Add(alpha_p[s] * beta_p[arc.nextstate] * r_a); + delr.Sub(alpha_p[s] * beta_p[arc.nextstate] * p_a); + + SignedLogDouble delH = delZ / Z; + delH.Sub(delZ / Z * r / Z); + delH.Add(delr / Z); + + // Push back delNce = -delH + (*post)[state_times[s]].push_back(std::make_pair(arc.ilabel, -delH.Value())); + + /* + (1/Z - r/Z/Z) * alpha_p[s] * beta_p[arc.nextstate] * p_a + + 1/Z * ( + alpha_p[s] * beta_r[arc.nextstate] * p_a + + alpha_r[s] * beta_p[arc.nextstate] * p_a + - alpha_p[s] * beta_p[arc.nextstate] * p_a + + alpha_p[s] * beta_p[arc.nextstate] * r_a + ) + + -(1.0/Z - r/Z/Z) * Exp(alpha_p[s] + beta_p[arc.nextstate] + log_p_a) + - (1.0/Z) * ( Exp(alpha_p[s] + beta_r[arc.nextstate] + log_p_a) + + 
Exp(alpha_r[s] + beta_p[arc.nextstate] + log_p_a) + - Exp(alpha_p[s] + beta_p[arc.nextstate] + log_p_a) + + Exp(alpha_p[s] + beta_p[arc.nextstate] + log_r_a) ))); + */ + } + } + } + return -H; // Negative Conditional Entropy +} + bool CompactLatticeToWordAlignment(const CompactLattice &clat, std::vector *words, std::vector *begin_times, @@ -969,6 +1419,7 @@ bool CompactLatticeToWordAlignment(const CompactLattice &clat, } + bool CompactLatticeToWordProns( const TransitionModel &tmodel, const CompactLattice &clat, diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h index 505aaffbe55..8aee928c6a7 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -4,6 +4,7 @@ // 2012-2013 Johns Hopkins University (Author: Daniel Povey); // Bagher BabaAli // 2014 Guoguo Chen +// 2014 Vimal Manohar // See ../../COPYING for clarification regarding multiple authors // @@ -33,9 +34,13 @@ #include "hmm/transition-model.h" #include "lat/kaldi-lattice.h" #include "itf/decodable-itf.h" +#include "base/kaldi-types-extra.h" namespace kaldi { +typedef SignedLogReal SignedLogDouble; +typedef SignedLogReal SignedLogBaseFloat; + /// This function iterates over the states of a topologically sorted lattice and /// counts the time instance corresponding to each state. The times are returned /// in a vector of integers 'times' which is resized to have a size equal to the @@ -61,7 +66,9 @@ int32 CompactLatticeStateTimes(const CompactLattice &clat, /// the objective function in MMI discriminative training. BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *arc_post, - double *acoustic_like_sum = NULL); + double *acoustic_like_sum = NULL, + std::vector *out_alpha = NULL, + std::vector *out_beta = NULL); // This function is something similar to LatticeForwardBackward(), but it is on // the CompactLattice lattice format. 
Also we only need the alpha in the forward @@ -74,6 +81,72 @@ bool ComputeCompactLatticeAlphas(const CompactLattice &lat, bool ComputeCompactLatticeBetas(const CompactLattice &lat, vector *beta); + +static inline double LogAddOrMax(bool viterbi, double a, double b) { + if (viterbi) + return std::max(a, b); + else + return LogAdd(a, b); +} + + +// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or +// best-path negated cost) Note: in either case, the alphas and betas are +// negated costs. Requires that lat be topologically sorted. This code +// will work for either CompactLattice or Latice. +template +double ComputeLatticeAlphasAndBetas(const LatticeType &lat, + bool viterbi, + vector *alpha, + vector *beta) { + typedef typename LatticeType::Arc Arc; + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + + StateId num_states = lat.NumStates(); + KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted); + KALDI_ASSERT(lat.Start() == 0); + alpha->resize(num_states, kLogZeroDouble); + beta->resize(num_states, kLogZeroDouble); + + double tot_forward_prob = kLogZeroDouble; + (*alpha)[0] = 0.0; + // Propagate alphas forward. + for (StateId s = 0; s < num_states; s++) { + double this_alpha = (*alpha)[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + double arc_like = -ConvertToCost(arc.weight); + (*alpha)[arc.nextstate] = LogAddOrMax(viterbi, (*alpha)[arc.nextstate], + this_alpha + arc_like); + } + Weight f = lat.Final(s); + if (f != Weight::Zero()) { + double final_like = this_alpha - ConvertToCost(f); + tot_forward_prob = LogAddOrMax(viterbi, tot_forward_prob, final_like); + } + } + for (StateId s = num_states-1; s >= 0; s--) { // it's guaranteed signed. 
+ double this_beta = -ConvertToCost(lat.Final(s)); + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + double arc_like = -ConvertToCost(arc.weight), + arc_beta = (*beta)[arc.nextstate] + arc_like; + this_beta = LogAddOrMax(viterbi, this_beta, arc_beta); + } + (*beta)[s] = this_beta; + } + double tot_backward_prob = (*beta)[lat.Start()]; + if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { + KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob + << ", while total backward probability = " << tot_backward_prob; + } + // Split the difference when returning... they should be the same. + return 0.5 * (tot_backward_prob + tot_forward_prob); +} + /// Topologically sort the compact lattice if not already topologically sorted. /// Will crash if the lattice cannot be topologically sorted. void TopSortCompactLatticeIfNeeded(CompactLattice *clat); @@ -176,6 +249,34 @@ BaseFloat LatticeForwardBackwardMpeVariants( bool one_silence_class, Posterior *post); +BaseFloat LatticeForwardBackwardEmpeVariants( + const TransitionModel &trans, + const std::vector &silence_phones, + const Lattice &lat, + const std::vector &num_ali, + const Posterior *num_post, + const Lattice *num_lat, + std::string criterion, + bool one_silence_class, + BaseFloat deletion_penalty, + Posterior *post, + BaseFloat weight_threshold = 0.0, + const std::vector *weights = NULL); + +BaseFloat LatticeForwardBackwardEmpeVariantsInternal( + const TransitionModel &trans, + const std::vector &silence_phones, + const Lattice &lat, + const std::vector &num_ali, + const Posterior &num_post, + const std::vector &alpha, + const std::vector &beta, + std::string criterion, + bool one_silence_class, + BaseFloat deletion_penalty, + Posterior *post, + const std::vector *weights); + /** This function can be used to compute posteriors for MMI, with a positive contribution for the numerator and a negative one for the denominator. 
This function is not actually @@ -198,6 +299,19 @@ BaseFloat LatticeForwardBackwardMmi( bool cancel, Posterior *arc_post); +/** + This function can be used to compute the derivatives of NCE objective + function. This function is written for using in neural-net + semi-supervised discriminative training. + It returns the objective function, which is the negative conditional + entropy of the lattice given the observation sequence. */ +SignedLogDouble LatticeForwardBackwardNce( + const TransitionModel &trans, + const Lattice &lat, + Posterior *arc_post, + const std::vector *weights = NULL, + BaseFloat weight_threshold = 0.0); + /// This function takes a CompactLattice that should only contain a single /// linear sequence (e.g. derived from lattice-1best), and that should have been diff --git a/src/latbin/Makefile b/src/latbin/Makefile index f1633978fbf..ef2b2064b24 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -20,7 +20,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-minimize lattice-limit-depth lattice-depth-per-frame \ lattice-confidence lattice-determinize-phone-pruned \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ - lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons + lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ + lattice-determinize-non-compact OBJFILES = diff --git a/src/latbin/lattice-copy.cc b/src/latbin/lattice-copy.cc index 76ca034b2e4..a97a75e7450 100644 --- a/src/latbin/lattice-copy.cc +++ b/src/latbin/lattice-copy.cc @@ -24,6 +24,108 @@ #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +namespace kaldi { + int32 CopySubsetLattices(std::string filename, + SequentialLatticeReader *lattice_reader, + LatticeWriter *lattice_writer, + bool include = true, bool ignore_missing = false + ) { + unordered_set subset; + std::set subset_list; + + bool binary; + Input ki(filename, &binary); + KALDI_ASSERT(!binary); + std::string line; + 
while (std::getline(ki.Stream(), line)) { + std::vector split_line; + SplitStringToVector(line, " \t\r", true, &split_line); + if(split_line.empty()) { + KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename; + } + subset.insert(split_line[0]); + subset_list.insert(split_line[0]); + } + + int32 num_total = 0; + size_t num_success = 0; + for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { + if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + KALDI_LOG << "The utterance " << lattice_reader->Key() + << " is larger than " + << "the last key in the include list. Not reading further."; + KALDI_LOG << "Wrote " << num_success << " utterances"; + return 0; + } + + if (include && subset.count(lattice_reader->Key()) > 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } else if (!include && subset.count(lattice_reader->Key()) == 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } + } + + KALDI_LOG << "Wrote " << num_success << " out of " << num_total + << " utterances."; + + if (ignore_missing) return 0; + + return (num_success != 0 ? 
0 : 1); + } + + int32 CopySubsetLattices(std::string filename, + SequentialCompactLatticeReader *lattice_reader, + CompactLatticeWriter *lattice_writer, + bool include = true, bool ignore_missing = false + ) { + unordered_set subset; + std::set subset_list; + + bool binary; + Input ki(filename, &binary); + KALDI_ASSERT(!binary); + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + SplitStringToVector(line, " \t\r", true, &split_line); + if(split_line.empty()) { + KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename; + } + subset.insert(split_line[0]); + subset_list.insert(split_line[0]); + } + + int32 num_total = 0; + size_t num_success = 0; + for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { + if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + KALDI_LOG << "The utterance " << lattice_reader->Key() + << " is larger than " + << "the last key in the include list. Not reading further."; + KALDI_LOG << "Wrote " << num_success << " utterances"; + return 0; + } + + if (include && subset.count(lattice_reader->Key()) > 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } else if (!include && subset.count(lattice_reader->Key()) == 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } + } + + KALDI_LOG << " Wrote " << num_success << " out of " << num_total + << " utterances."; + + if (ignore_missing) return 0; + + return (num_success != 0 ? 0 : 1); + } +} + int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -36,14 +138,29 @@ int main(int argc, char *argv[]) { const char *usage = "Copy lattices (e.g. 
useful for changing to text mode or changing\n" "format to standard from compact lattice.)\n" + "The --include and --exclude mutually exclusive options of this " + "program, which are intended to copy only subset of lattices.\n" "Usage: lattice-copy [options] lattice-rspecifier lattice-wspecifier\n" " e.g.: lattice-copy --write-compact=false ark:1.lats ark,t:text.lats\n" "See also: lattice-to-fst, and the script egs/wsj/s5/utils/convert_slf.pl\n"; ParseOptions po(usage); - bool write_compact = true; + bool write_compact = true, ignore_missing = false; + std::string include_rxfilename; + std::string exclude_rxfilename; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - + po.Register("include", &include_rxfilename, + "Text file, the first field of each " + "line being interpreted as an " + "utterance-id whose features will be included"); + po.Register("exclude", &exclude_rxfilename, + "Text file, the first field of each " + "line being interpreted as an utterance-id" + " whose features will be excluded"); + po.Register("ignore-missing", &ignore_missing, + "Exit with status 0 even if no lattices are copied"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -59,15 +176,46 @@ int main(int argc, char *argv[]) { if (write_compact) { SequentialCompactLatticeReader lattice_reader(lats_rspecifier); CompactLatticeWriter lattice_writer(lats_wspecifier); + + if (include_rxfilename != "") { + if (exclude_rxfilename != "") { + KALDI_ERR << "should not have both --exclude and --include option!"; + } + return CopySubsetLattices(include_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } else if (exclude_rxfilename != "") { + return CopySubsetLattices(exclude_rxfilename, + &lattice_reader, &lattice_writer, + false, ignore_missing); + } + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value()); } else { SequentialLatticeReader 
lattice_reader(lats_rspecifier); LatticeWriter lattice_writer(lats_wspecifier); + + if (include_rxfilename != "") { + if (exclude_rxfilename != "") { + KALDI_ERR << "should not have both --exclude and --include option!"; + } + return CopySubsetLattices(include_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } else if (exclude_rxfilename != "") { + return CopySubsetLattices(exclude_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value()); } KALDI_LOG << "Done copying " << n_done << " lattices."; + + if (ignore_missing) return 0; + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc new file mode 100644 index 00000000000..f3f0acde892 --- /dev/null +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -0,0 +1,315 @@ +// latbin/lattice-determinize-non-compact.cc + +// Copyright 2009-2012 Microsoft Corporation +// 2012-2013 Johns Hopkins University (Author: Daniel Povey) +// 2015 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "util/stl-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "lat/minimize-lattice.h" + +#ifdef _MSC_VER +#include +using std::unordered_map; +#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) +#include +using std::unordered_map; +#else +#include +using std::tr1::unordered_map; +#endif + + +namespace kaldi { + +typedef Lattice::StateId StateId; +typedef Lattice::Arc Arc; + +bool DeterminizeLatticeWrapper(const Lattice &lat, + const std::string &key, + bool prune, + BaseFloat beam, + BaseFloat beam_ratio, + int32 max_mem, + int32 max_loop, + BaseFloat delta, + int32 num_loops, + CompactLattice *clat) { + fst::DeterminizeLatticeOptions lat_opts; + lat_opts.max_mem = max_mem; + lat_opts.max_loop = max_loop; + lat_opts.delta = delta; + BaseFloat cur_beam = beam; + for (int32 i = 0; i < num_loops;) { // we increment i below. + + if (lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Detected empty lattice, skipping " << key; + return false; + } + + // The work gets done in the next line. + if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { + if (prune) PruneLattice(cur_beam, clat); + return true; + } else { // failed to determinize.. 
+ KALDI_WARN << "Failed to determinize lattice (presumably max-states " + << "reached), reducing lattice-beam to " + << (cur_beam*beam_ratio) << " and re-trying."; + for (; i < num_loops; i++) { + cur_beam *= beam_ratio; + Lattice pruned_lat(lat); + PruneLattice(cur_beam, &pruned_lat); + if (NumArcs(lat) == NumArcs(pruned_lat)) { + cur_beam *= beam_ratio; + KALDI_WARN << "Pruning did not have an effect on the original " + << "lattice size; reducing beam to " + << cur_beam << " and re-trying."; + } else if (DeterminizeLattice(pruned_lat, clat, lat_opts, NULL)) { + if (prune) PruneLattice(cur_beam, clat); + return true; + } else { + KALDI_WARN << "Determinization failed again; reducing beam again to " + << (cur_beam*beam_ratio) << " and re-trying."; + } + } + } + } + KALDI_WARN << "Decreased pruning beam --num-loops=" << num_loops + << " times and was not able to determinize: failed for " + << key; + return false; +} + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, PairHasher > *acoustic_scores) { + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different acoustic costs for tid " + << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + 
it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, PairHasher > &acoustic_scores, + Lattice *lat) { + fst::TopSort(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "lattice-determinize lattices (and apply a pruning beam)\n" + " (see http://kaldi.sourceforge.net/lattices.html for more explanation)\n" + "This version of the program retains the original " + "acoustic scores of arcs in the lattice. " + " note: this program is tyically only useful if you generated state-level\n" + " lattices, e.g. 
called gmm-latgen-simple with --determinize=false\n" + "\n" + "Usage: lattice-determinize-non-compact [options] lattice-rspecifier lattice-wspecifier\n" + " e.g.: lattice-determinize-non-compact --acoustic-scale=0.1 --beam=15.0 ark:1.lats ark:det.lats\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat beam = 10.0; + BaseFloat beam_ratio = 0.9; + int32 num_loops = 20; + int32 max_mem = 50000000; // 50 MB + int32 max_loop = 500000; + BaseFloat delta = fst::kDelta; + bool prune = false; + bool minimize = false; + + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("beam", &beam, + "Pruning beam [applied after acoustic scaling]-- also used " + "to handle determinization failures, set --prune=false to " + "disable routine pruning"); + po.Register("delta", &delta, "Tolerance used in determinization"); + po.Register("prune", &prune, "If true, prune determinized lattices " + "with the --beam option."); + po.Register("max-mem", &max_mem, "Maximum approximate memory usage in " + "determinization (real usage might be many times this)"); + po.Register("max-loop", &max_loop, "Option to detect a certain " + "type of failure in lattice determinization (not critical)"); + po.Register("beam-ratio", &beam_ratio, "Ratio by which to " + "decrease beam if we reach the max-arcs."); + po.Register("num-loops", &num_loops, "Number of times to " + "decrease beam by beam-ratio if determinization fails."); + po.Register("minimize", &minimize, + "If true, push and minimize after determinization"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + lats_wspecifier = po.GetArg(2); + + + // Read as regular lattice-- this is the form we need it in for efficient + // pruning. + SequentialLatticeReader lattice_reader(lats_rspecifier); + + // Write as regular lattice. 
+ LatticeWriter lattice_writer(lats_wspecifier); + + int32 n_done = 0, n_error = 0; + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + LatticeWeight beam_weight(beam, static_cast(0.0)); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + + lattice_reader.FreeCurrent(); + + fst::TopSort(&lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); + + unordered_map, std::pair, PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + + Invert(&lat); // make it so word labels are on the input. + + CompactLattice clat; + if (DeterminizeLatticeWrapper(lat, key, prune, + beam, beam_ratio, max_mem, max_loop, + delta, num_loops, &clat)) { + if (minimize) { + PushCompactLatticeStrings(&clat); + PushCompactLatticeWeights(&clat); + MinimizeCompactLattice(&clat); + } + + Lattice out_lat; + fst::ConvertLattice(clat, &out_lat); + fst::TopSort(&out_lat); + + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat); + lattice_writer.Write(key, out_lat); + n_done++; + } else { + n_error++; // will have already printed warning. + } + } + + KALDI_LOG << "Done " << n_done << " lattices, errors on " << n_error; + return (n_done != 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/lm/Makefile b/src/lm/Makefile index ddda9576557..acf327d994f 100644 --- a/src/lm/Makefile +++ b/src/lm/Makefile @@ -10,10 +10,10 @@ MATHLIB = NONE include ../kaldi.mk -TESTFILES = lm-lib-test +TESTFILES = arpa-file-parser-test lm-lib-test -OBJFILES = const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o kaldi-rnnlm.o \ - mikolov-rnnlm-lib.o +OBJFILES = arpa-file-parser.o const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o \ + kaldi-rnnlm.o mikolov-rnnlm-lib.o TESTOUTPUTS = composed.fst output.fst output1.fst output2.fst diff --git a/src/lm/arpa-file-parser-test.cc b/src/lm/arpa-file-parser-test.cc new file mode 100644 index 00000000000..e37a916d263 --- /dev/null +++ b/src/lm/arpa-file-parser-test.cc @@ -0,0 +1,365 @@ +// lm/arpa-file-parser-test.cc + +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + * @file arpa-file-parser-test.cc + * @brief Unit tests for the ARPA language model file parser.
+ */ + +#include +#include +#include +#include +#include +#include "lm/kaldi-lm.h" + +#include "lm/arpa-file-parser.h" + +namespace kaldi { +namespace { + +const int kMaxOrder = 3; + +struct NGramTestData { + int32 line_number; + float logprob; + int32 words[kMaxOrder]; + float backoff; +}; + +std::ostream& operator<<(std::ostream& os, const NGramTestData& data) { + std::ios::fmtflags saved_state(os.flags()); + os << std::fixed << std::setprecision(6); + + os << data.logprob << ' '; + for (int i = 0; i < kMaxOrder; ++i) os << data.words[i] << ' '; + os << data.backoff << " // Line " << data.line_number; + + os.flags(saved_state); + return os; +} + +// This does not own the array pointer, and uset to simplify passing expected +// result to TestableArpaFileParser::Verify. +template +struct CountedArray { + template + CountedArray(T(&array)[N]) : array(array), count(N) { } + const T* array; + const size_t count; +}; + +template +inline CountedArray MakeCountedArray(T(&array)[N]) { + return CountedArray(array); +} + +class TestableArpaFileParser : public ArpaFileParser { + public: + TestableArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : ArpaFileParser(options, symbols), + header_available_(false), + read_complete_(false), + last_order_(0) { } + void Validate(CountedArray counts, CountedArray ngrams); + + private: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + + bool header_available_; + bool read_complete_; + int32 last_order_; + std::vector ngrams_; +}; + +void TestableArpaFileParser::HeaderAvailable() { + KALDI_ASSERT(!header_available_); + KALDI_ASSERT(!read_complete_); + header_available_ = true; + KALDI_ASSERT(NgramCounts().size() <= kMaxOrder); +} + +void TestableArpaFileParser::ConsumeNGram(const NGram& ngram) { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + KALDI_ASSERT(ngram.words.size() <= NgramCounts().size()); + KALDI_ASSERT(ngram.words.size() >= last_order_); + last_order_ = ngram.words.size(); + + NGramTestData entry = { 0 }; + entry.line_number = LineNumber(); + entry.logprob = ngram.logprob; + entry.backoff = ngram.backoff; + std::copy(ngram.words.begin(), ngram.words.end(), entry.words); + ngrams_.push_back(entry); +} + +void TestableArpaFileParser::ReadComplete() { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + read_complete_ = true; +} + +// +bool CompareNgrams(const NGramTestData& actual, + const NGramTestData& expected) { + if (actual.line_number != expected.line_number + || !std::equal(actual.words, actual.words + kMaxOrder, + expected.words) + || !ApproxEqual(actual.logprob, expected.logprob) + || !ApproxEqual(actual.backoff, expected.backoff)) { + KALDI_WARN << "Actual n-gram [" << actual + << "] differs from expected [" << expected << "]"; + return false; + } + return true; +} + +void TestableArpaFileParser::Validate( + CountedArray expect_counts, + CountedArray expect_ngrams) { + // This needs better disagnostics probably. 
+ KALDI_ASSERT(NgramCounts().size() == expect_counts.count); + KALDI_ASSERT(std::equal(NgramCounts().begin(), NgramCounts().end(), + expect_counts.array)); + + KALDI_ASSERT(ngrams_.size() == expect_ngrams.count); + // auto mpos = std::mismatch(ngrams_.begin(), ngrams_.end(), + // expect_ngrams.array, CompareNgrams); + // if (mpos.first != ngrams_.end()) + // KALDI_ERR << "Maismatch at index " << mpos.first - ngrams_.begin(); + //TODO:auto above requres C++11, and I cannot spell out the type!!! + KALDI_ASSERT(std::equal(ngrams_.begin(), ngrams_.end(), + expect_ngrams.array, CompareNgrams)); +} + +// Read integer LM (no symbols) with log base conversion. +void ReadIntegerLmLogconvExpectSuccess() { + KALDI_LOG << "ReadIntegerLmLogconvExpectSuccess()"; + + static std::string integer_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.234679 4 -3.3\n\ +-3.456783 5\n\ +0.0000000 1 -2.5\n\ +-4.333333 2\n\ +\n\ +\\2-grams:\n\ +-1.45678 4 5 -3.23\n\ +-1.30490 1 4 -4.2\n\ +\n\ +\\3-grams:\n\ +-0.34958 1 4 5\n\ +-0.23940 4 5 2\n\ +\n\ +\\end\\"; + + int32 expect_counts[] = { 4, 2, 2 }; + NGramTestData expect_ngrams[] = { + { 7, -12.05329, { 4, 0, 0 }, -7.598531 }, + { 8, -7.959537, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -5.756463 }, + { 10, -9.977868, { 2, 0, 0 }, 0.0 }, + + { 13, -3.354360, { 4, 5, 0 }, -7.437350 }, + { 14, -3.004643, { 1, 4, 0 }, -9.670857 }, + + { 17, -0.804938, { 1, 4, 5 }, 0.0 }, + { 18, -0.551239, { 4, 5, 2 }, 0.0 } }; + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + + TestableArpaFileParser parser(options, NULL); + std::istringstream stm(integer_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_ngrams)); +} + +// \xCE\xB2 = UTF-8 for Greek beta, to churn some UTF-8 cranks. 
+static std::string symbolic_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.2 a -3.3\n\ +-3.4 \xCE\xB2\n\ +0.0 -2.5\n\ +-4.3 \n\ +\n\ +\\2-grams:\n\ +-1.5 a \xCE\xB2 -3.2\n\ +-1.3 a -4.2\n\ +\n\ +\\3-grams:\n\ +-0.3 a \xCE\xB2\n\ +-0.2 a \n\ +\n\ +\\end\\"; + +// Symbol table that is created with predefined test symbols, "a" but no "b". +class TestSymbolTable : public fst::SymbolTable { + public: + TestSymbolTable() { + AddSymbol("", 0); + AddSymbol("", 1); + AddSymbol("", 2); + AddSymbol("", 3); + AddSymbol("a", 4); + } +}; + +// Full expected result shared between ReadSymbolicLmNoOovImpl and +// ReadSymbolicLmWithOovAddToSymbols(). +NGramTestData expect_symbolic_full[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 5, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 5 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + +// This is run with all possible oov setting and yields same result. 
+void ReadSymbolicLmNoOovImpl(ArpaParseOptions::OovHandling oov) { + int32 expect_counts[] = { 4, 2, 2 }; + TestSymbolTable symbols; + symbols.AddSymbol("\xCE\xB2", 5); + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, &symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_symbolic_full)); + KALDI_ASSERT(symbols.NumSymbols() == 6); +} + +void ReadSymbolicLmNoOovTests() { + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kRaiseError)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kRaiseError); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kAddToSymbols)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kAddToSymbols); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kReplaceWithUnk)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kReplaceWithUnk); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kSkipNGram)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kSkipNGram); +} + +// This is run with all possible oov setting and yields same result. 
+void ReadSymbolicLmWithOovImpl( + ArpaParseOptions::OovHandling oov, + CountedArray expect_ngrams, + fst::SymbolTable* symbols) { + int32 expect_counts[] = { 4, 2, 2 }; + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), expect_ngrams); +} + +void ReadSymbolicLmWithOovAddToSymbols() { + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kAddToSymbols, + MakeCountedArray(expect_symbolic_full), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 6); + KALDI_ASSERT(symbols.Find("\xCE\xB2") == 5); +} + +void ReadSymbolicLmWithOovReplaceWithUnk() { + NGramTestData expect_symbolic_unk_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 3, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 3, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 3 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kReplaceWithUnk, + MakeCountedArray(expect_symbolic_unk_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovSkipNGram() { + NGramTestData expect_symbolic_no_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kSkipNGram, + MakeCountedArray(expect_symbolic_no_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovTests() { + KALDI_LOG << "ReadSymbolicLmWithOovAddToSymbols()"; + ReadSymbolicLmWithOovAddToSymbols(); + KALDI_LOG << 
"ReadSymbolicLmWithOovReplaceWithUnk()"; + ReadSymbolicLmWithOovReplaceWithUnk(); + KALDI_LOG << "ReadSymbolicLmWithOovSkipNGram()"; + ReadSymbolicLmWithOovSkipNGram(); +} + +} // namespace +} // namespace kaldi + +int main(int argc, char *argv[]) { + kaldi::ReadIntegerLmLogconvExpectSuccess(); + kaldi::ReadSymbolicLmNoOovTests(); + kaldi::ReadSymbolicLmWithOovTests(); +} diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc new file mode 100644 index 00000000000..2d8f9f18638 --- /dev/null +++ b/src/lm/arpa-file-parser.cc @@ -0,0 +1,236 @@ +// lm/arpa-file-parser.cc + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "base/kaldi-error.h" +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" +#include "util/text-utils.h" + +namespace kaldi { + +ArpaFileParser::ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : options_(options), symbols_(symbols), line_number_(0) { +} + +ArpaFileParser::~ArpaFileParser() { +} + +void ArpaFileParser::Read(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + + // Argument sanity checks. 
+ if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 || + options_.bos_symbol == options_.eos_symbol) + KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and " + << "differ from each other. Given:" + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && + options_.oov_handling == ArpaParseOptions::kReplaceWithUnk && + (options_.unk_symbol <= 0 || + options_.unk_symbol == options_.bos_symbol || + options_.unk_symbol == options_.eos_symbol)) + KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, " + << "UNK symbol is required, must not be epsilon, and " + << "differ from both BOS and EOS symbols. Given:" + << " UNK=" << options_.unk_symbol + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty()) + KALDI_ERR << "BOS symbol must exist in symbol table"; + if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty()) + KALDI_ERR << "EOS symbol must exist in symbol table"; + if (symbols_ != NULL && options_.unk_symbol > 0 && + symbols_->Find(options_.unk_symbol).empty()) + KALDI_ERR << "UNK symbol must exist in symbol table"; + + ngram_counts_.clear(); + line_number_ = 0; + +#define PARSE_ERR (KALDI_ERR << "in line " << line_number_ << ": ") + + // Give derived class an opportunity to prepare its state. + ReadStarted(); + + std::string line; + + // Processes "\data\" section. + bool keyword_found = false; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + + // The section keywords starts with backslash. We terminate the while loop + // if a new section is found. 
+ if (line[0] == '\\') { + if (!keyword_found && line == "\\data\\") { + KALDI_LOG << "Reading \\data\\ section."; + keyword_found = true; + continue; + } + break; + } + + if (!keyword_found) continue; + + // Enters "\data\" section, and looks for patterns like "ngram 1=1000", + // which means there are 1000 unigrams. + std::size_t equal_symbol_pos = line.find("="); + if (equal_symbol_pos != std::string::npos) + line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" + std::vector col; + SplitStringToVector(line, " \t", true, &col); + if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") { + int32 order, ngram_count = 0; + if (!ConvertStringToInteger(col[1], &order) || + !ConvertStringToInteger(col[3], &ngram_count)) { + PARSE_ERR << "Cannot parse ngram count '" << line << "'."; + } + if (ngram_counts_.size() <= order) { + ngram_counts_.resize(order); + } + ngram_counts_[order - 1] = ngram_count; + } else { + KALDI_WARN << "Uninterpretable line in \\data\\ section: " << line; + } + } + + if (ngram_counts_.size() == 0) + PARSE_ERR << "\\data\\ section missing or empty."; + + // Signal that grammar order and n-gram counts are known. + HeaderAvailable(); + + NGram ngram; + ngram.words.reserve(ngram_counts_.size()); + + // Processes "\N-grams:" section. + for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) { + // Skips n-grams with zero count. + if (ngram_counts_[cur_order - 1] == 0) { + KALDI_WARN << "Zero ngram count in ngram order " << cur_order + << "(look for 'ngram " << cur_order << "=0' in the \\data\\ " + << " section). There is possibly a problem with the file."; + continue; + } + + // Must be looking at a \k-grams: directive at this point. 
+ std::ostringstream keyword; + keyword << "\\" << cur_order << "-grams:"; + if (line != keyword.str()) { + PARSE_ERR << "Invalid directive '" << line << "', " + << "expecting '" << keyword.str() << "'."; + } + KALDI_LOG << "Reading " << line << " section."; + + int32 ngram_count = 0; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + if (line[0] == '\\') break; + + std::vector col; + SplitStringToVector(line, " \t", true, &col); + + if (col.size() < 1 + cur_order || + col.size() > 2 + cur_order || + (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) { + PARSE_ERR << "Invalid n-gram line '" << line << "'"; + } + ++ngram_count; + + // Parse out n-gram logprob and, if present, backoff weight. + if (!ConvertStringToReal(col[0], &ngram.logprob)) { + PARSE_ERR << "Invalid n-gram logprob '" << col[0] << "'."; + } + ngram.backoff = 0.0; + if (col.size() > cur_order + 1) { + if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff)) + PARSE_ERR << "Invalid backoff weight '" << col[cur_order + 1] << "'."; + } + // Convert to natural log unless the option is set not to. + if (!options_.use_log10) { + ngram.logprob *= M_LN10; + ngram.backoff *= M_LN10; + } + + ngram.words.resize(cur_order); + bool skip_ngram = false; + for (int32 index = 0; !skip_ngram && index < cur_order; ++index) { + int32 word; + if (symbols_) { + // Symbol table provided, so symbol labels are expected. 
+ if (options_.oov_handling == ArpaParseOptions::kAddToSymbols) { + word = symbols_->AddSymbol(col[1 + index]); + } else { + word = symbols_->Find(col[1 + index]); + if (word == fst::SymbolTable::kNoSymbol) { + switch(options_.oov_handling) { + case ArpaParseOptions::kReplaceWithUnk: + word = options_.unk_symbol; + break; + case ArpaParseOptions::kSkipNGram: + skip_ngram = true; + break; + default: + PARSE_ERR << "Word '" << col[1 + index] + << "' not in symbol table."; + } + } + } + } else { + // Symbols not provided, LM file should contain integers. + if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) { + PARSE_ERR << "invalid symbol '" << col[1 + index] << "'"; + } + } + // Whichever way we got it, an epsilon is invalid. + if (word == 0) { + PARSE_ERR << "Epsilon symbol '" << col[1 + index] + << "' is illegal in ARPA LM."; + } + ngram.words[index] = word; + } + if (!skip_ngram) { + ConsumeNGram(ngram); + } + } + if (ngram_count > ngram_counts_[cur_order - 1]) { + PARSE_ERR << "Header said there would be " << ngram_counts_[cur_order] + << " n-grams of order " << cur_order << ", but we saw " + << ngram_count; + } + } + + if (line != "\\end\\") { + PARSE_ERR << "Invalid or unexpected directive line '" << line << "', " + << "expected \\end\\."; + } + + ReadComplete(); + +#undef PARSE_ERR +} + +} // namespace kaldi diff --git a/src/lm/arpa-file-parser.h b/src/lm/arpa-file-parser.h new file mode 100644 index 00000000000..0011fb4ee21 --- /dev/null +++ b/src/lm/arpa-file-parser.h @@ -0,0 +1,125 @@ +// lm/arpa-file-parser.h + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_LM_ARPA_FILE_PARSER_H_ +#define KALDI_LM_ARPA_FILE_PARSER_H_ + +#include +#include + +#include + +#include "base/kaldi-types.h" + +namespace kaldi { + +/** + Options that control ArpaFileParser +*/ +struct ArpaParseOptions { + enum OovHandling { + kRaiseError, ///< Abort on OOV words + kAddToSymbols, ///< Add novel words to the symbol table. + kReplaceWithUnk, ///< Replace OOV words with . + kSkipNGram ///< Skip n-gram with OOV word and continue. + }; + + ArpaParseOptions() + : bos_symbol(-1), eos_symbol(-1), unk_symbol(-1), + oov_handling(kRaiseError), use_log10(false) { } + + int32 bos_symbol; ///< Symbol for , Required non-epsilon. + int32 eos_symbol; ///< Symbol for , Required non-epsilon. + int32 unk_symbol; ///< Symbol for , Required for kReplaceWithUnk. + OovHandling oov_handling; ///< How to handle OOV words in the file. + bool use_log10; ///< Use log10 for prob and backoff weight, not ln. +}; + +/** + A parsed n-gram from ARPA LM file. +*/ +struct NGram { + NGram() : logprob(0.0), backoff(0.0) { } + std::vector words; ///< Symbols in LTR order. + float logprob; ///< Log-prob of the n-gram. + float backoff; ///< log-backoff weight of the n-gram. +}; + +/** + ArpaFileParser is an abstract base class for ARPA LM file conversion. + + See ConstArpaLmBuilder for a usage example. +*/ +class ArpaFileParser { + public: + /// Constructs the parser with the given options and optional symbol table. 
+ /// If symbol table is provided, then the file should contain text n-grams, + /// and the words are mapped to symbols through it. bos_symbol and + /// eos_symbol in the options structure must be valid symbols in the table, + /// and so must be unk_symbol if provided. The table is not owned by the + /// parser, but may be augmented, if oov_handling is set to kAddToSymbols. + /// If symbol table is a null pointer, the file should contain integer + /// symbol values, and oov_handling has no effect. bos_symbol and eos_symbol + /// must be valid symbols still. + ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols); + virtual ~ArpaFileParser(); + + /// Read ARPA LM file through Kaldi I/O functions. Only text mode is + /// supported. + void Read(std::istream &is, bool binary); + + const ArpaParseOptions& Options() const { return options_; } + + protected: + /// Override called before reading starts. This is the point to prepare + /// any state in the derived class. + virtual void ReadStarted() { } + + /// Override function called to signal that ARPA header with the expected + /// number of n-grams has been read, and ngram_counts() is now valid. + virtual void HeaderAvailable() { } + + /// Pure override that must be implemented to process current n-gram. The + /// n-grams are sent in the file order, which guarantees that all + /// (k-1)-grams are processed before the first k-gram is. + virtual void ConsumeNGram(const NGram&) = 0; + + /// Override function called after the last n-gram has been consumed. + virtual void ReadComplete() { } + + /// Read-only access to symbol table. + const fst::SymbolTable* Symbols() const { return symbols_; } + + /// Inside ConsumeNGram(), provides the current line number. + int32 LineNumber() const { return line_number_; } + + /// N-gram counts. Valid in and after a call to HeaderAvailable(). 
+ const std::vector& NgramCounts() const { return ngram_counts_; } + + private: + ArpaParseOptions options_; + fst::SymbolTable* symbols_; // Not owned. + int32 line_number_; + std::vector ngram_counts_; +}; + +} // namespace kaldi + +#endif // KALDI_LM_ARPA_FILE_PARSER_H_ diff --git a/src/lm/const-arpa-lm.cc b/src/lm/const-arpa-lm.cc index 7f63dce886e..5043933d7f0 100644 --- a/src/lm/const-arpa-lm.cc +++ b/src/lm/const-arpa-lm.cc @@ -22,13 +22,14 @@ #include #include +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" #include "lm/const-arpa-lm.h" #include "util/stl-utils.h" #include "util/text-utils.h" -#include "base/kaldi-math.h" -namespace kaldi { +namespace kaldi { // Auxiliary struct for converting ConstArpaLm format langugae model to Arpa // format. @@ -173,13 +174,10 @@ class LmState { // Class to build ConstArpaLm from Arpa format language model. It relies on the // auxiliary class LmState above. -class ConstArpaLmBuilder { +class ConstArpaLmBuilder : public ArpaFileParser { public: - ConstArpaLmBuilder( - const bool natural_base, const int32 bos_symbol, - const int32 eos_symbol, const int32 unk_symbol) : - natural_base_(natural_base), bos_symbol_(bos_symbol), - eos_symbol_(eos_symbol), unk_symbol_(unk_symbol) { + ConstArpaLmBuilder(ArpaParseOptions options) + : ArpaFileParser(options, NULL) { ngram_order_ = 0; num_words_ = 0; overflow_buffer_size_ = 0; @@ -204,21 +202,21 @@ class ConstArpaLmBuilder { } } - // Reads in the Arpa format language model, parses it and creates LmStates. - void Read(std::istream &is, bool binary); - // Writes ConstArpaLm. void Write(std::ostream &os, bool binary) const; - // Builds ConstArpaLm. - void Build(); - void SetMaxAddressOffset(const int32 max_address_offset) { KALDI_WARN << "You are changing ; the default should " << "not be changed unless you are in testing mode."; max_address_offset_ = max_address_offset; } + protected: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + private: struct WordsAndLmStatePairLessThan { bool operator()( @@ -229,10 +227,6 @@ class ConstArpaLmBuilder { }; private: - // If true, use natural base e for log-prob, otherwise use base 10. The - // default base in Arpa format language model is base 10. - bool natural_base_; - // Indicating if ConstArpaLm has been built or not. bool is_built_; @@ -240,16 +234,6 @@ class ConstArpaLmBuilder { // The default value is 30-bits and should not be changed except for testing. int32 max_address_offset_; - // Integer corresponds to . - int32 bos_symbol_; - - // Integer corresponds to . - int32 eos_symbol_; - - // Integer corresponds to unknown-word. -1 if no unknown-word symbol is - // provided. - int32 unk_symbol_; - // N-gram order of language model. This can be figured out from "/data/" // section in Arpa format language model. int32 ngram_order_; @@ -280,201 +264,58 @@ class ConstArpaLmBuilder { LmState*, VectorHasher > seq_to_state_; }; -// Reads in the Arpa format language model, parses it and puts the word sequence -// into the corresponding LmState in . -void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { - if (binary) { - KALDI_ERR << "binary-mode reading is not implemented for " - << "ConstArpaLmBuilder."; - } - - std::string line; - - // Number of n-grams from "\data\" section. Those numbers should match the - // actual number of n-grams from "\N-grams:" sections. - // Note that when we convert the words in the Arpa format language model into - // integers, we remove lines with OOV words. We also modify the n-gram counts - // in "\data\" correspondingly. - std::vector num_ngrams; - - // Processes "\data\" section. - bool keyword_found = false; - while (getline(is, line) && !is.eof()) { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. 
- if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos) break; - if (line.find("\\end\\") != std::string::npos) break; - } - - std::size_t equal_symbol_pos = line.find("="); - if (equal_symbol_pos != std::string::npos) - line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\data\". - if (!keyword_found && col.size() == 1 && col[0] == "\\data\\") { - KALDI_LOG << "Reading \"\\data\\\" section."; - keyword_found = true; - continue; - } +void ConstArpaLmBuilder::HeaderAvailable() { + ngram_order_ = NgramCounts().size(); +} - // Enters "\data\" section, and looks for patterns like"ngram 1=1000", which - // means there are 1000 unigrams. - if (keyword_found && col.size() == 4 && col[0] == "ngram") { - if (col[2] == "=") { - int32 order, ngram_count; - if (!ConvertStringToInteger(col[1], &order)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1] << " to integer."; - } - if (!ConvertStringToInteger(col[3], &ngram_count)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[3] << " to integer."; - } - if (num_ngrams.size() <= order) { - num_ngrams.resize(order + 1); - } - num_ngrams[order] = ngram_count; - } else { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } - } else if (keyword_found) { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } +void ConstArpaLmBuilder::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + // If is larger than 1, then we do not create LmState for + // the final order entry. We only keep the log probability for it. 
+ LmState *lm_state = NULL; + if (cur_order != ngram_order_ || ngram_order_ == 1) { + lm_state = new LmState(cur_order == 1, + cur_order == ngram_order_ - 1, + ngram.logprob, ngram.backoff); + + KALDI_ASSERT(seq_to_state_.find(ngram.words) == seq_to_state_.end()); + seq_to_state_[ngram.words] = lm_state; } - if (num_ngrams.size() == 0) - KALDI_ERR << "Fail to read \"\\data\\\" section."; - ngram_order_ = num_ngrams.size() - 1; - - // Processes "\N-grams:" section. - int32 max_word_id = 0; - for (int32 cur_order = 1; cur_order < num_ngrams.size(); ++cur_order) { - // Skips n-grams with zero count. - if (num_ngrams[cur_order] == 0) continue; - - keyword_found = false; - int32 ngram_count = 0; - std::ostringstream keyword; - keyword << "\\" << cur_order << "-grams:"; - // We use "do ... while" loop since one line has already been read. - do { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos && keyword_found) break; - if (line.find("\\end\\") != std::string::npos) break; - } - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\N-gram:" if the keyword has not been located. - if (!keyword_found && col.size() == 1 && col[0] == keyword.str()) { - KALDI_LOG << "Reading \"" << keyword.str() << "\" section."; - ngram_count = 0; - keyword_found = true; - continue; - } - - // Enters "\N-grams:" section if the keyword has been located. - if (keyword_found && col.size() > 0) { - KALDI_ASSERT(col.size() >= 1 + cur_order); - KALDI_ASSERT(col.size() <= 2 + cur_order); // backoff_logprob can be 0. - if (cur_order == ngram_order_ && col.size() == 2 + cur_order) { - KALDI_ERR << "Backoff probability detected for final-order entry \"" - << line << "\"."; - } - ngram_count++; - - // If backoff_logprob is 0, it will not appear in Arpa format language - // model. 
We put it back so the processing afterwards will be easier. - if (col.size() == 1 + cur_order) { - col.push_back("0"); - } - - // Creates LmState for the current word sequence. - bool is_unigram = (cur_order == 1) ? true : false; - float logprob; - float backoff_logprob; - KALDI_ASSERT(ConvertStringToReal(col[0], &logprob)); - KALDI_ASSERT(ConvertStringToReal(col[1 + cur_order], &backoff_logprob)); - if (natural_base_) { - logprob *= Log(10.0f); - backoff_logprob *= Log(10.0f); - } - - // If is larger than 1, then we do not create LmState for - // the final order entry. We only keep the log probability for it. - LmState *lm_state = NULL; - if (cur_order != ngram_order_ || ngram_order_ == 1) { - lm_state = new LmState(is_unigram, - (cur_order == ngram_order_ - 1), - logprob, backoff_logprob); - } - - // Figures out the sequence of words. - std::vector seq(cur_order, 0); - for (int32 index = 0; index < cur_order; ++index) { - int32 word; - if (!ConvertStringToInteger(col[1 + index], &word)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1 + index] << " to integer."; - } - seq[index] = word; - } - - // If is larger than 1, then we do not insert LmState to - // . - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(seq_to_state_.find(seq) == seq_to_state_.end()); - seq_to_state_[seq] = lm_state; - } - - // If n-gram order is larger than 1, we have to add possible child to - // existing LmStates. We have the following two assumptions: - // 1. N-grams are processed from small order to larger ones, i.e., from - // 1, 2, ... to the highest order. - // 2. If a n-gram exists in the Arpa format language model, then the - // "history" n-gram also exists. For example, if "A B C" is a valid - // n-gram, then "A B" is also a valid n-gram. 
- if (cur_order > 1) { - std::vector hist(seq.begin(), seq.begin() + cur_order - 1); - int32 word = seq[seq.size() - 1]; - unordered_map, - LmState*, VectorHasher >::iterator hist_iter; - hist_iter = seq_to_state_.find(hist); - KALDI_ASSERT(hist_iter != seq_to_state_.end()); - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, lm_state); - } else { - KALDI_ASSERT(lm_state == NULL); - KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, logprob); - } - } else { - // Figures out . - KALDI_ASSERT(seq.size() == 1); - if (seq[0] > max_word_id) { - max_word_id = seq[0]; - } - } - } - } while (getline(is, line) && !is.eof()); - if (ngram_count > num_ngrams[cur_order] || - (ngram_count == 0 && num_ngrams[cur_order] != 0)) { - KALDI_ERR << "Header said there would be " << num_ngrams[cur_order] - << " n-grams of order " << cur_order << ", but we saw " - << ngram_count; + // If n-gram order is larger than 1, we have to add possible child to + // existing LmStates. We have the following two assumptions: + // 1. N-grams are processed from small order to larger ones, i.e., from + // 1, 2, ... to the highest order. + // 2. If a n-gram exists in the Arpa format language model, then the + // "history" n-gram also exists. For example, if "A B C" is a valid + // n-gram, then "A B" is also a valid n-gram. + int32 last_word = ngram.words[cur_order - 1]; + if (cur_order > 1) { + std::vector hist(ngram.words.begin(), ngram.words.end() - 1); + unordered_map, + LmState*, VectorHasher >::iterator hist_iter; + hist_iter = seq_to_state_.find(hist); + if (hist_iter == seq_to_state_.end()) { + std::ostringstream ss; + for (int i = 0; i < cur_order; ++i) + ss << (i == 0 ? 
'[' : ' ') << ngram.words[i]; + KALDI_ERR << "In line " << LineNumber() << ": " + << cur_order << "-gram " << ss.str() << "] does not have " + << "a parent model " << cur_order << "-gram."; + } + if (cur_order != ngram_order_ || ngram_order_ == 1) { + KALDI_ASSERT(lm_state != NULL); + KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, lm_state); + } else { + KALDI_ASSERT(lm_state == NULL); + KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, ngram.logprob); } + } else { + // Figures out . + num_words_ = std::max(num_words_, last_word + 1); } - - // is plus 1. - num_words_ = max_word_id + 1; } // ConstArpaLm can be built in the following steps, assuming we have already @@ -503,7 +344,7 @@ void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { // At the same time, we will also create two special buffers: // // -void ConstArpaLmBuilder::Build() { +void ConstArpaLmBuilder::ReadComplete() { // STEP 1: sorting LmStates lexicographically. // Vector for holding the sorted LmStates. std::vector*, LmState*> > sorted_vec; @@ -637,9 +478,10 @@ void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const { KALDI_ASSERT(is_built_); // Creates ConstArpaLm. 
- ConstArpaLm const_arpa_lm(bos_symbol_, eos_symbol_, unk_symbol_, ngram_order_, - num_words_, overflow_buffer_size_, lm_states_size_, - unigram_states_, overflow_buffer_, lm_states_); + ConstArpaLm const_arpa_lm( + Options().bos_symbol, Options().eos_symbol, Options().unk_symbol, + ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_, + unigram_states_, overflow_buffer_, lm_states_); const_arpa_lm.Write(os, binary); } @@ -1224,10 +1066,15 @@ bool BuildConstArpaLm(const bool natural_base, const int32 bos_symbol, const int32 eos_symbol, const int32 unk_symbol, const std::string& arpa_rxfilename, const std::string& const_arpa_wxfilename) { - ConstArpaLmBuilder lm_builder(natural_base, bos_symbol, - eos_symbol, unk_symbol); + ArpaParseOptions options; + options.bos_symbol = bos_symbol; + options.eos_symbol = eos_symbol; + options.unk_symbol = unk_symbol; + options.use_log10 = !natural_base; + + ConstArpaLmBuilder lm_builder(options); + KALDI_LOG << "Reading " << arpa_rxfilename; ReadKaldiObject(arpa_rxfilename, &lm_builder); - lm_builder.Build(); WriteKaldiObject(lm_builder, const_arpa_wxfilename, true); return true; } diff --git a/src/lm/kaldi-rnnlm.cc b/src/lm/kaldi-rnnlm.cc index e1fbcbdc08b..3a811c4c0e5 100644 --- a/src/lm/kaldi-rnnlm.cc +++ b/src/lm/kaldi-rnnlm.cc @@ -58,8 +58,8 @@ KaldiRnnlmWrapper::KaldiRnnlmWrapper( BaseFloat KaldiRnnlmWrapper::GetLogProb( int32 word, const std::vector &wseq, - const std::vector &context_in, - std::vector *context_out) { + const std::vector &context_in, + std::vector *context_out) { std::vector wseq_symbols(wseq.size()); for (int32 i = 0; i < wseq_symbols.size(); ++i) { @@ -79,7 +79,7 @@ RnnlmDeterministicFst::RnnlmDeterministicFst(int32 max_ngram_order, // Uses empty history for . std::vector