diff --git a/egs/sprakbanken/s5/RESULTS b/egs/sprakbanken/s5/RESULTS index 628507ac85d..d64f006f806 100644 --- a/egs/sprakbanken/s5/RESULTS +++ b/egs/sprakbanken/s5/RESULTS @@ -1,25 +1,28 @@ -%WER 49.19 [ 5318 / 10811, 481 ins, 1511 del, 3326 sub ] exp/mono0a/decode_3g_test1k/wer_9 -%WER 47.28 [ 5111 / 10811, 443 ins, 1489 del, 3179 sub ] exp/mono0a/decode_b3g_test1k/wer_10 -%WER 16.19 [ 1750 / 10811, 397 ins, 323 del, 1030 sub ] exp/sgmm2_5a/decode_3g_test1k/wer_9 -%WER 15.10 [ 1632 / 10811, 404 ins, 305 del, 923 sub ] exp/sgmm2_5b/decode_3g_test1k/wer_9 -%WER 14.94 [ 1615 / 10811, 390 ins, 310 del, 915 sub ] exp/sgmm2_5b/decode_4g_test1k/wer_9 -%WER 14.36 [ 1553 / 10811, 376 ins, 264 del, 913 sub ] exp/sgmm2_5c/decode_3g_test1k/wer_9 -%WER 14.18 [ 1533 / 10811, 367 ins, 266 del, 900 sub ] exp/sgmm2_5c/decode_4g_test1k/wer_9 -%WER 25.61 [ 2769 / 10811, 511 ins, 539 del, 1719 sub ] exp/tri1/decode_3g_test1k/wer_10 -%WER 25.12 [ 2716 / 10811, 444 ins, 571 del, 1701 sub ] exp/tri1/decode_b3g_test1k/wer_11 -%WER 23.81 [ 2574 / 10811, 426 ins, 564 del, 1584 sub ] exp/tri2a/decode_3g_test1k/wer_12 -%WER 23.22 [ 2510 / 10811, 457 ins, 517 del, 1536 sub ] exp/tri2a/decode_3g_test1k_fromlats/wer_11 -%WER 22.18 [ 2398 / 10811, 436 ins, 495 del, 1467 sub ] exp/tri2b/decode_3g_test1k/wer_11 -%WER 21.87 [ 2364 / 10811, 380 ins, 553 del, 1431 sub ] exp/tri2b/decode_3g_test1k_mbr/wer_13 -%WER 18.98 [ 2052 / 10811, 451 ins, 372 del, 1229 sub ] exp/tri3b_20k/decode_3g_test1k/wer_11 -%WER 22.62 [ 2445 / 10811, 468 ins, 460 del, 1517 sub ] exp/tri3b_20k/decode_3g_test1k.si/wer_10 -%WER 19.31 [ 2088 / 10811, 440 ins, 388 del, 1260 sub ] exp/tri3b/decode_3g_test1k/wer_11 -%WER 23.19 [ 2507 / 10811, 435 ins, 520 del, 1552 sub ] exp/tri3b/decode_3g_test1k.si/wer_12 -%WER 19.06 [ 2061 / 10811, 427 ins, 384 del, 1250 sub ] exp/tri3b/decode_4g_test1k/wer_11 -%WER 23.20 [ 2508 / 10811, 447 ins, 520 del, 1541 sub ] exp/tri3b/decode_4g_test1k.si/wer_11 -%WER 17.42 [ 1883 / 10811, 416 ins, 359 del, 1108 sub ] exp/tri4a/decode_3g_test1k/wer_13 -%WER 20.86 [ 2255 / 10811, 403 ins, 473 del, 1379 sub ] exp/tri4a/decode_3g_test1k.si/wer_13 -%WER 17.52 [ 1894 / 10811, 396 ins, 372 del, 1126 sub ] exp/tri4b/decode_3g_test1k/wer_13 -%WER 20.82 [ 2251 / 10811, 399 ins, 471 del, 1381 sub ] exp/tri4b/decode_3g_test1k.si/wer_13 -%WER 17.53 [ 1895 / 10811, 403 ins, 375 del, 1117 sub ] exp/tri4b/decode_4g_test1k/wer_13 -%WER 20.99 [ 2269 / 10811, 438 ins, 436 del, 1395 sub ] exp/tri4b/decode_4g_test1k.si/wer_11 +GMM-based systems +%WER 22.87 [ 24286 / 106172, 3577 ins, 5321 del, 15388 sub ] exp/tri1/decode_fg_dev/wer_12_0.5 +%WER 23.13 [ 24561 / 106172, 3602 ins, 5411 del, 15548 sub ] exp/tri1/decode_tg_dev/wer_12_0.5 +%WER 21.24 [ 22548 / 106172, 4028 ins, 4246 del, 14274 sub ] exp/tri2a/decode_tg_dev/wer_13_0.0 +%WER 19.46 [ 20664 / 106172, 3276 ins, 4332 del, 13056 sub ] exp/tri2b/decode_tg_dev/wer_15_0.5 +%WER 16.80 [ 17839 / 106172, 3238 ins, 3403 del, 11198 sub ] exp/tri3b/decode_fg_dev/wer_17_0.0 +%WER 19.45 [ 20651 / 106172, 3880 ins, 3671 del, 13100 sub ] exp/tri3b/decode_fg_dev.si/wer_15_0.0 +%WER 14.24 [ 9849 / 69165, 2046 ins, 1365 del, 6438 sub ] exp/tri3b/decode_fg_test/wer_16_0.5 +%WER 17.31 [ 11972 / 69165, 2330 ins, 1695 del, 7947 sub ] exp/tri3b/decode_fg_test.si/wer_15_0.5 +%WER 16.94 [ 17984 / 106172, 3361 ins, 3377 del, 11246 sub ] exp/tri3b/decode_tg_dev/wer_16_0.0 +%WER 19.52 [ 20720 / 106172, 3654 ins, 3846 del, 13220 sub ] exp/tri3b/decode_tg_dev.si/wer_17_0.0 +%WER 14.40 [ 9957 / 69165, 2291 ins, 1184 del, 
6482 sub ] exp/tri3b/decode_tg_test/wer_16_0.0 +%WER 17.41 [ 12044 / 69165, 2291 ins, 1736 del, 8017 sub ] exp/tri3b/decode_tg_test.si/wer_15_0.5 +nnet3 xent systems +%WER 11.57 [ 12279 / 106172, 2640 ins, 2442 del, 7197 sub ] exp/nnet3/tdnn0_sp/decode_dev/wer_10_0.0 +%WER 9.89 [ 6841 / 69165, 1542 ins, 917 del, 4382 sub ] exp/nnet3/tdnn0_sp/decode_test/wer_11_0.5 +%WER 10.45 [ 11098 / 106172, 2199 ins, 2272 del, 6627 sub ] exp/nnet3/lstm_0_ld5_sp/decode_dev/wer_9_0.0 +%WER 12.34 [ 8533 / 69165, 1740 ins, 1393 del, 5400 sub ] exp/nnet3/lstm_0_ld5_sp/decode_test/wer_11_1.0 +%WER 10.59 [ 11241 / 106172, 2208 ins, 2304 del, 6729 sub ] exp/nnet3/lstm_bidirectional_ld5_sp/decode_dev/wer_9_0.0 +%WER 12.43 [ 8596 / 69165, 1742 ins, 1426 del, 5428 sub ] exp/nnet3/lstm_bidirectional_ld5_sp/decode_test/wer_11_1.0 +%WER 9.18 [ 9747 / 106172, 1987 ins, 1913 del, 5847 sub ] exp/nnet3/lstm_bidirectional_sp/decode_dev/wer_8_0.0 +Nnet3 chain systems +%WER 8.48 [ 9001 / 106172, 1559 ins, 1624 del, 5818 sub ] exp/chain/tdnn_lstm1a_sp_bi/decode_dev/wer_9_0.0 +%WER 7.20 [ 4981 / 69165, 915 ins, 402 del, 3664 sub ] exp/chain/tdnn_lstm1a_sp_bi/decode_test/wer_8_1.0 +%WER 10.00 [ 10619 / 106172, 1980 ins, 1896 del, 6743 sub ] exp/chain/tdnn_sp_bi/decode_dev/wer_9_0.0 +%WER 8.58 [ 5936 / 69165, 1059 ins, 667 del, 4210 sub ] exp/chain/tdnn_sp_bi/decode_test/wer_9_1.0 +%WER 9.39 [ 9969 / 106172, 1624 ins, 1912 del, 6433 sub ] exp/chain/lstm1e_sp_bi/decode_dev/wer_8_0.5 +%WER 7.72 [ 5341 / 69165, 1002 ins, 497 del, 3842 sub ] exp/chain/lstm1e_sp_bi/decode_test/wer_8_0.5 diff --git a/egs/sprakbanken/s5/conf/mfcc_hires.conf b/egs/sprakbanken/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..b5aeaafe704 --- /dev/null +++ b/egs/sprakbanken/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+ # Needs to be this low to be sensitive to creaky voice +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/sprakbanken/s5/conf/online_cmvn.conf b/egs/sprakbanken/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/sprakbanken/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/sprakbanken/s5/local/chain/compare_wer_general.sh b/egs/sprakbanken/s5/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..4074b0c12c3 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/compare_wer_general.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Prints a table makes it easy to compare WER and objective values across nnet3 +# and chain training runs + +echo -n "System " +for x in "$@"; do printf "% 10s" $x; done +echo + +echo -n "WER on dev(tg) " +for x in "$@"; do + wer=$(grep WER ${x}/decode_dev/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on test(tg) " +for x in "$@"; do + wer=$(grep WER ${x}/decode_test/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in "$@"; do + prob=$(grep Overall ${x}/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in "$@"; do + prob=$(grep Overall ${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in "$@"; do + prob=$(grep Overall ${x}/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in "$@"; do + prob=$(grep Overall ${x}/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/sprakbanken/s5/local/chain/run_lstm.sh b/egs/sprakbanken/s5/local/chain/run_lstm.sh new file mode 120000 index 00000000000..afba2a1ce94 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/run_lstm.sh @@ -0,0 +1 @@ +tuning/run_lstm_1e.sh \ No newline at end of file diff --git a/egs/sprakbanken/s5/local/chain/run_tdnn.sh b/egs/sprakbanken/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/sprakbanken/s5/local/chain/run_tdnn_lstm.sh b/egs/sprakbanken/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh new file mode 100755 index 00000000000..3ea61800869 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh @@ -0,0 +1,260 @@ +#!/bin/bash + +# run_lstm_1a.sh is a first attempt at an LSTM system, based on xconfigs-- it's +# probably not very well configured, e.g. the num-params might be too small. +# recurrent-projection-dim is less than non-recurrent-projection-dim due to an +# oversight. 
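For concreteness, the two per-layer configurations in question are quoted below from the stage-17 xconfigs in this patch; this script uses a non-recurrent projection twice the size of the recurrent one, while run_lstm_1e.sh later brings the two into line (1d keeps the 1a dimensions; note that 1e also switches to the 'fast' layer implementation, so the difference is not only in the projection dims):

  # run_lstm_1a.sh (this script): non-recurrent projection twice the recurrent one
  lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3
  # run_lstm_1e.sh: both projection dims set to 128
  fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3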
+ +# comparison with TDNN system (WER is worse): +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1b_sp_bi exp/chain_cleaned/lstm1a_sp_bi +# System tdnn1b_sp_bi lstm1a_sp_bi +# WER on dev(orig) 10.2 10.8 +# WER on dev(rescored) 9.6 10.2 +# WER on test(orig) 9.7 10.0 +# WER on test(rescored) 9.2 9.6 +# Final train prob -0.0928 -0.0848 +# Final valid prob -0.1178 -0.1098 +# Final train prob (xent) -1.4666 -1.1692 +# Final valid prob (xent) -1.5473 -1.2520 + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_lstm.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script (run_lstm_1a) is like run_tdnn_1b.sh except modified to use an LSTM +# configuration (some aspects borrowed from egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh). + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1a #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh new file mode 100755 index 00000000000..a22d4eb53d7 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# run_lstm_1b.sh is as run_lstm_1a.sh but replacing the projected LSTM +# with a regular LSTM. This is done in order to have an LSTM-only baseline +# for the 'fast lstm', where we need to test the regular as well as projected +# LSTM layers. + +# It's worse than the LSTMP, as expected, due to more overtraining. 
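The overtraining can be read off the comparison quoted below: 1b reaches a better final train prob than 1a (-0.0787 vs -0.0848) while its valid prob is no better (-0.1104 vs -0.1098). The one-line training summaries used in these headers (the steps/info/chain_dir_info.pl lines) show the same train/valid gap and can be regenerated with a sketch like the following, assuming both experiments have been trained into the directories named here:

  # print the num-params and train/valid objective summary for each system
  for d in exp/chain_cleaned/lstm1a_sp_bi exp/chain_cleaned/lstm1b_sp_bi; do
    steps/info/chain_dir_info.pl $d
  done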
+ +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1b_sp_bi +# exp/chain_cleaned/lstm1b_sp_bi: num-iters=253 nj=2..12 num-params=9.6M dim=40+100->3607 combine=-0.09->-0.09 xent:train/valid[167,252,final]=(-1.24,-1.14,-1.14/-1.35,-1.28,-1.28) logprob:train/valid[167,252,final]=(-0.092,-0.079,-0.079/-0.119,-0.110,-0.110) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1a_sp_bi exp/chain_cleaned/lstm1b_sp_bi +# System lstm1a_sp_bi lstm1b_sp_bi +# WER on dev(orig) 10.8 11.3 +# WER on dev(rescored) 10.2 10.7 +# WER on test(orig) 10.0 10.6 +# WER on test(rescored) 9.6 10.0 +# Final train prob -0.0848 -0.0787 +# Final valid prob -0.1098 -0.1104 +# Final train prob (xent) -1.1692 -1.1442 +# Final valid prob (xent) -1.2520 -1.2782 + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_lstm.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1b #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstm-layer name=lstm1 cell-dim=512 delay=-3 + lstm-layer name=lstm2 cell-dim=512 delay=-3 + lstm-layer name=lstm3 cell-dim=512 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh new file mode 100755 index 00000000000..718992fc909 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh @@ -0,0 +1,259 @@ +#!/bin/bash + + +# run_lstm_1c.sh is like run_lstm_1b.sh but changing from the old LSTM +# implementation to our new 'fast' LSTM layer. The xconfig changes from +# 'lstm-layer' to 'fast-lstm-layer'. It's as good as or maybe slightly better +# than the old setup. 
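Since the only functional change from 1b is the layer name (stage 17 below simply writes 'fast-lstm-layer' lines in its heredoc), an existing 1b-style network.xconfig could equally be converted with a one-line substitution; this is purely illustrative and the file names are hypothetical:

  # rewrite indented 'lstm-layer ...' lines as 'fast-lstm-layer ...', leaving other layers untouched
  sed 's/^\( *\)lstm-layer /\1fast-lstm-layer /' network_1b.xconfig > network_1c.xconfig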
+ +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1c_sp_bi +# exp/chain_cleaned/lstm1c_sp_bi: num-iters=253 nj=2..12 num-params=9.6M dim=40+100->3607 combine=-0.09->-0.09 xent:train/valid[167,252,final]=(-1.26,-1.14,-1.14/-1.34,-1.27,-1.27) logprob:train/valid[167,252,final]=(-0.092,-0.078,-0.078/-0.116,-0.111,-0.111) + + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1b_sp_bi exp/chain_cleaned/lstm1c_sp_bi +# System lstm1b_sp_bi lstm1c_sp_bi +# WER on dev(orig) 11.3 11.2 +# WER on dev(rescored) 10.7 10.5 +# WER on test(orig) 10.6 10.6 +# WER on test(rescored) 10.0 10.1 +# Final train prob -0.0787 -0.0777 +# Final valid prob -0.1104 -0.1108 +# Final train prob (xent) -1.1442 -1.1445 +# Final valid prob (xent) -1.2782 -1.2692 + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_lstm.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1c #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstm-layer name=lstm1 cell-dim=512 delay=-3 + fast-lstm-layer name=lstm2 cell-dim=512 delay=-3 + fast-lstm-layer name=lstm3 cell-dim=512 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh new file mode 100755 index 00000000000..8cf543f5096 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh @@ -0,0 +1,272 @@ +#!/bin/bash + + +# run_lstm_1d.sh is like run_lstm_1c.sh, but switching back to projected +# LSTM (LSTMP)... the configuration is the same 1a (but unlike 1a it uses +# the fast lstm layer). Note: 1a and 1d are a little broken +# in that their non-recurrent-projection-dim are twice the recurrent-projection-dim, +# but it's better for comparison purposes to have this the same as 1a. + +# As you can see, compared to 1a, 1d is 0.3% to 0.5% better absolute; +# this comes with the upgrade to 'fast' LSTM. There were differences to how +# the gradient truncation is done, maybe that's it; also there are +# other differences, like how the update of the diagonal matrices +# are done, and the integration of 4 matrix multiplies into one which +# will affect the natural gradient. 
Anyway, we're not complaining. + + +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1d_sp_bi +# exp/chain_cleaned/lstm1d_sp_bi: num-iters=253 nj=2..12 num-params=6.4M dim=40+100->3607 combine=-0.09->-0.09 xent:train/valid[167,252,final]=(-1.21,-1.13,-1.13/-1.29,-1.22,-1.23) logprob:train/valid[167,252,final]=(-0.092,-0.083,-0.081/-0.114,-0.105,-0.105) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1a_sp_bi exp/chain_cleaned/lstm1c_sp_bi exp/chain_cleaned/lstm1d_sp_bi +# System lstm1a_sp_bi lstm1c_sp_bi lstm1d_sp_bi +# WER on dev(orig) 10.8 11.2 10.3 +# WER on dev(rescored) 10.2 10.5 9.8 +# WER on test(orig) 10.0 10.6 9.7 +# WER on test(rescored) 9.6 10.1 9.2 +# Final train prob -0.0848 -0.0777 -0.0812 +# Final valid prob -0.1098 -0.1108 -0.1049 +# Final train prob (xent) -1.1692 -1.1445 -1.1334 +# Final valid prob (xent) -1.2520 -1.2692 -1.2263 + + + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_lstm.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1d #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh new file mode 100755 index 00000000000..11af644e765 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# (From the original script: +# run_lstm_1e.sh is like run_lstm_1d.sh, but reducing non-recurrent-projection-dim +# from 256 to 128 (fixes an earlier mistake). +# However, this doesn't improve WER results-- see below. Probably the system +# has too few parameters. Anyway we probably won't tune this further +# as LSTMs by themselves aren't expected to perform that well: +# see run_tdnn_lstm_1a.sh and others in that sequence.) 
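The WER figures in the comparison below, like the corresponding lines added to RESULTS at the top of this patch, are the best entries over the per-LMWT/penalty scoring outputs of the decode directories; compare_wer_general.sh picks them with utils/best_wer.sh, and once this experiment has been run they can be pulled out by hand in the same way (directory names as produced by this script):

  # best WER over all LM weights and word-insertion penalties, for dev and test
  for d in exp/chain/lstm1e_sp_bi/decode_dev exp/chain/lstm1e_sp_bi/decode_test; do
    grep WER $d/wer_* | utils/best_wer.sh
  done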
+ +# steps/info/chain_dir_info.pl exp/chain/lstm1e_sp_bi +# exp/chain/lstm1e_sp_bi: num-iters=384 nj=2..12 num-params=4.7M dim=40+100->3557 combine=-0.07->-0.07 xent:train/valid[255,383,final]=(-0.755,-0.703,-0.712/-0.793,-0.755,-0.761) logprob:train/valid[255,383,final]=(-0.060,-0.053,-0.053/-0.071,-0.066,-0.065) + +# local/chain/compare_wer_general.sh exp/chain/tdnn_sp_bi/ exp/chain/lstm1e_sp_bi/ +# System exp/chain/tdnn_sp_bi/exp/chain/lstm1e_sp_bi/ +# WER on dev(tg) 10.00 9.39 +# WER on test(tg) 8.58 7.72 +# Final train prob -0.0642 -0.0528 +# Final valid prob -0.0788 -0.0651 +# Final train prob (xent) -0.9113 -0.7117 +# Final valid prob (xent) -0.9525 -0.7607 + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default: +# local/chain/run_lstm.sh + +# note, that you should probably adjust parallelisation to your setup +# if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# note, if you have already run one of the chain nnet3 systems, +# you may want to run with --stage 17. + + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=7 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train +gmm=tri3b # the gmm for the target data +num_threads_ubm=32 +nnet3_affix= # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1e #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..21e3edac5f3 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,202 @@ +#!/bin/bash + +# This is the original TDNN script before we introduced xconfigs. +# See run_tdnn_1b.sh for comparative results. + + +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..14973a5d029 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# steps/info/chain_dir_info.pl exp/chain/tdnn_sp_bi/ +# exp/chain/tdnn_sp_bi/: num-iters=384 nj=2..12 num-params=7.0M dim=40+100->3557 combine=-0.08->-0.08 xent:train/valid[255,383,final]=(-0.954,-0.911,-0.911/-0.979,-0.953,-0.952) logprob:train/valid[255,383,final]=(-0.071,-0.064,-0.064/-0.084,-0.079,-0.079) + +# local/chain/compare_wer_general.sh exp/nnet3/tdnn0_sp exp/chain/tdnn_sp_bi +# System exp/nnet3/tdnn0_spexp/chain/tdnn_sp_bi +# WER on dev(tg) 11.57 10.00 +# WER on test(tg) 9.89 8.58 +# Final train prob -0.79890.7538 -0.0642 +# Final valid prob -0.77280.7590 -0.0788 +# Final train prob (xent) -0.9113 +# Final valid prob (xent) -0.9525 + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default: +# local/chain/run_tdnn.sh + +# note, that you should probably adjust parallelisation to your setup +# if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=7 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train +gmm=tri3b # the gmm for the target data +num_threads_ubm=32 +nnet3_affix= # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..7f7f263a741 --- /dev/null +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,258 @@ +#!/bin/bash + +# steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1a_sp_bi/ +# exp/chain/tdnn_lstm1a_sp_bi/: num-iters=384 nj=2..12 num-params=9.5M dim=40+100->3557 combine=-0.05->-0.05 xent:train/valid[255,383,final]=(-0.579,-0.518,-0.523/-0.651,-0.616,-0.619) logprob:train/valid[255,383,final]=(-0.046,-0.038,-0.038/-0.063,-0.060,-0.059) + +# local/chain/compare_wer_general.sh exp/chain/tdnn_sp_bi/ exp/chain/lstm1e_sp_bi/ exp/chain/tdnn_lstm1a_sp_bi/ +# System exp/chain/tdnn_sp_bi/exp/chain/lstm1e_sp_bi/exp/chain/tdnn_lstm1a_sp_bi/ +# WER on dev(tg) 10.00 9.39 8.48 +# WER on test(tg) 8.58 7.72 7.20 +# Final train prob -0.0642 -0.0528 -0.0378 +# Final valid prob -0.0788 -0.0651 -0.0595 +# Final train prob (xent) -0.9113 -0.7117 -0.5228 +# Final valid prob (xent) -0.9525 -0.7607 -0.6185 + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default: +# local/chain/run_tdnn_lstm.sh + +# note, that you may want to adjust parallelisation to your setup +# if you have already run one of the non-chain nnet3 systems +# (e.g. 
local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=7 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train +gmm=tri3b # the gmm for the target data +num_threads_ubm=32 +nnet3_affix= # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/sprakbanken/s5/local/cstr_ndx2flist.pl b/egs/sprakbanken/s5/local/cstr_ndx2flist.pl deleted file mode 100755 index d19db421a9f..00000000000 --- a/egs/sprakbanken/s5/local/cstr_ndx2flist.pl +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This is modified from the script in standard Kaldi recipe to account -# for the way the WSJ data is structured on the Edinburgh systems. 
-# - Arnab Ghoshal, 12/1/12 - -# This program takes as its standard input an .ndx file from the WSJ corpus that looks -# like this: -#;; File: tr_s_wv1.ndx, updated 04/26/94 -#;; -#;; Index for WSJ0 SI-short Sennheiser training data -#;; Data is read WSJ sentences, Sennheiser mic. -#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts -#;; per speaker TI) = 7236 utts -#;; -#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 -#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 -#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 - -# and as command-line argument it takes the names of the WSJ disk locations, e.g.: -# /group/corpora/public/wsjcam0/data on DICE machines. -# It outputs a list of absolute pathnames. - -$wsj_dir = $ARGV[0]; - -while(){ - if(m/^;/){ next; } # Comment. Ignore it. - else { - m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; - $filename = $2; # as a subdirectory of the distributed disk. - if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; } - $filename = "$wsj_dir/$filename"; - if (-e $filename) { - print "$filename\n"; - } else { - print STDERR "File $filename found in the index but not on disk\n"; - } - } -} diff --git a/egs/sprakbanken/s5/local/find_transcripts.pl b/egs/sprakbanken/s5/local/find_transcripts.pl deleted file mode 100755 index 6429411b864..00000000000 --- a/egs/sprakbanken/s5/local/find_transcripts.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program takes on its standard input a list of utterance -# id's, one for each line. (e.g. 4k0c030a is a an utterance id). -# It takes as -# Extracts from the dot files the transcripts for a given -# dataset (represented by a file list). -# - -@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; -$dot_flist = shift @ARGV; - -open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; -while(){ - chop; - m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; - $spk = $1; - $spk2dot{$spk} = $_; -} - - - -while(){ - chop; - $uttid = $_; - $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; - $spk = $1; - if($spk ne $curspk) { - %utt2trans = { }; # Don't keep all the transcripts in memory... 
- $curspk = $spk; - $dotfile = $spk2dot{$spk}; - defined $dotfile || die "No dot file for speaker $spk\n"; - open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; - while() { - $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; - $trans = $1; - $utt = $2; - $utt2trans{$utt} = $trans; - } - } - if(!defined $utt2trans{$uttid}) { - print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; - } else { - print "$uttid $utt2trans{$uttid}\n"; - } -} - - diff --git a/egs/sprakbanken/s5/local/flist2scp.pl b/egs/sprakbanken/s5/local/flist2scp.pl deleted file mode 100755 index 234e4add1ed..00000000000 --- a/egs/sprakbanken/s5/local/flist2scp.pl +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# takes in a file list with lines like -# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 -# and outputs an scp in kaldi format with lines like -# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 -# (the first thing is the utterance-id, which is the same as the basename of the file. - - -while(<>){ - m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; - $id = $1; - $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) - print "$id $_"; -} - diff --git a/egs/sprakbanken/s5/local/generate_example_kws.sh b/egs/sprakbanken/s5/local/generate_example_kws.sh deleted file mode 100755 index 2c849438192..00000000000 --- a/egs/sprakbanken/s5/local/generate_example_kws.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) -# Apache 2.0. - - -if [ $# -ne 2 ]; then - echo "Usage: local/generate_example_kws.sh " - echo " e.g.: local/generate_example_kws.sh data/test_eval92/ " - exit 1; -fi - -datadir=$1; -kwsdatadir=$2; -text=$datadir/text; - -mkdir -p $kwsdatadir; - -# Generate keywords; we generate 20 unigram keywords with at least 20 counts, -# 20 bigram keywords with at least 10 counts and 10 trigram keywords with at -# least 5 counts. -cat $text | perl -e ' - %unigram = (); - %bigram = (); - %trigram = (); - while(<>) { - chomp; - @col=split(" ", $_); - shift @col; - for($i = 0; $i < @col; $i++) { - # unigram case - if (!defined($unigram{$col[$i]})) { - $unigram{$col[$i]} = 0; - } - $unigram{$col[$i]}++; - - # bigram case - if ($i < @col-1) { - $word = $col[$i] . " " . $col[$i+1]; - if (!defined($bigram{$word})) { - $bigram{$word} = 0; - } - $bigram{$word}++; - } - - # trigram case - if ($i < @col-2) { - $word = $col[$i] . " " . $col[$i+1] . " " . 
$col[$i+2]; - if (!defined($trigram{$word})) { - $trigram{$word} = 0; - } - $trigram{$word}++; - } - } - } - - $max_count = 100; - $total = 20; - $current = 0; - $min_count = 20; - while ($current < $total && $min_count <= $max_count) { - foreach $x (keys %unigram) { - if ($unigram{$x} == $min_count) { - print "$x\n"; - $unigram{$x} = 0; - $current++; - } - if ($current == $total) { - last; - } - } - $min_count++; - } - - $total = 20; - $current = 0; - $min_count = 4; - while ($current < $total && $min_count <= $max_count) { - foreach $x (keys %bigram) { - if ($bigram{$x} == $min_count) { - print "$x\n"; - $bigram{$x} = 0; - $current++; - } - if ($current == $total) { - last; - } - } - $min_count++; - } - - $total = 10; - $current = 0; - $min_count = 3; - while ($current < $total && $min_count <= $max_count) { - foreach $x (keys %trigram) { - if ($trigram{$x} == $min_count) { - print "$x\n"; - $trigram{$x} = 0; - $current++; - } - if ($current == $total) { - last; - } - } - $min_count++; - } - ' > $kwsdatadir/raw_keywords.txt - -echo "Keywords generation succeeded" diff --git a/egs/sprakbanken/s5/local/generate_results_file.sh b/egs/sprakbanken/s5/local/generate_results_file.sh new file mode 100755 index 00000000000..4659c36fc5a --- /dev/null +++ b/egs/sprakbanken/s5/local/generate_results_file.sh @@ -0,0 +1,16 @@ + +echo "GMM-based systems" +for x in exp/*/decode*;do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done + +echo "nnet3 xent systems" +for x in exp/nnet3/tdnn*/decode* exp/nnet3/lstm*/decode* ;do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done + +echo "Nnet3 chain systems" +for x in exp/chain/tdnn*/decode* exp/chain/lstm*/decode*;do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done + diff --git a/egs/sprakbanken/s5/local/kws_data_prep.sh b/egs/sprakbanken/s5/local/kws_data_prep.sh deleted file mode 100755 index 5222a88c9ef..00000000000 --- a/egs/sprakbanken/s5/local/kws_data_prep.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) -# Apache 2.0. - - -if [ $# -ne 3 ]; then - echo "Usage: local/kws_data_prep.sh " - echo " e.g.: local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/" - exit 1; -fi - -langdir=$1; -datadir=$2; -kwsdatadir=$3; - -mkdir -p $kwsdatadir; - -# Create keyword id for each keyword -cat $kwsdatadir/raw_keywords.txt | perl -e ' - $idx=1; - while(<>) { - chomp; - printf "WSJ-%04d $_\n", $idx; - $idx++; - }' > $kwsdatadir/keywords.txt - -# Map the keywords to integers; note that we remove the keywords that -# are not in our $langdir/words.txt, as we won't find them anyway... -cat $kwsdatadir/keywords.txt | \ - sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \ - grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int - -# Compile keywords into FSTs -transcripts-to-fsts ark:$kwsdatadir/keywords.int ark:$kwsdatadir/keywords.fsts - -# Create utterance id for each utterance; Note that by "utterance" here I mean -# the keys that will appear in the lattice archive. You may have to modify here -cat $datadir/wav.scp | \ - awk '{print $1}' | \ - sort | uniq | perl -e ' - $idx=1; - while(<>) { - chomp; - print "$_ $idx\n"; - $idx++; - }' > $kwsdatadir/utter_id - -# Map utterance to the names that will appear in the rttm file. You have -# to modify the commands below accoring to your rttm file. In the WSJ case -# since each file is an utterance, we assume that the actual file names will -# be the "names" in the rttm, so the utterance names map to themselves. 
-cat $datadir/wav.scp | \ - awk '{print $1}' | \ - sort | uniq | perl -e ' - while(<>) { - chomp; - print "$_ $_\n"; - }' > $kwsdatadir/utter_map; -echo "Kws data preparation succeeded" diff --git a/egs/sprakbanken/s5/local/nnet3/run_blstm.sh b/egs/sprakbanken/s5/local/nnet3/run_blstm.sh new file mode 100755 index 00000000000..f29731397fe --- /dev/null +++ b/egs/sprakbanken/s5/local/nnet3/run_blstm.sh @@ -0,0 +1,48 @@ +stage=0 +train_stage=-10 +affix=bidirectional +nnet3_affix= +common_egs_dir= +remove_egs=true +train_set=train +gmm=tri3b + + +# BLSTM params +cell_dim=1024 +rp_dim=128 +nrp_dim=128 +chunk_left_context=40 +chunk_right_context=40 + +# training options +srand=0 +num_jobs_initial=3 +num_jobs_final=15 +samples_per_iter=20000 +num_epochs=6 +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +local/nnet3/run_lstm.sh --affix $affix \ + --srand $srand \ + --stage $stage \ + --train-stage $train_stage \ + --train-set $train_set \ + --gmm $gmm \ + --lstm-delay " [-1,1] [-2,2] [-3,3] " \ + --label-delay 0 \ + --cell-dim $cell_dim \ + --recurrent-projection-dim $rp_dim \ + --non-recurrent-projection-dim $nrp_dim \ + --common-egs-dir "$common_egs_dir" \ + --chunk-left-context $chunk_left_context \ + --chunk-right-context $chunk_right_context \ + --num-jobs-initial $num_jobs_initial \ + --num-jobs-final $num_jobs_final \ + --samples-per-iter $samples_per_iter \ + --num-epochs $num_epochs \ + --remove-egs $remove_egs + diff --git a/egs/sprakbanken/s5/local/nnet3/run_ivector_common.sh b/egs/sprakbanken/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..9a730348dfa --- /dev/null +++ b/egs/sprakbanken/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +nj=30 +min_seg_len=1.55 # min length in seconds... we do this because chain training + # will discard segments shorter than 1.5 seconds. Must remain in sync + # with the same option given to prepare_lores_feats_and_alignments.sh +train_set=train # you might set this to e.g. train. +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix=_n3 # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp_comb + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. 
You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/sprakbanken-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev test; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev test; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" + # we have to combine short segments or we won't be able to train chain models + # on those segments. + utils/data/combine_short_segments.sh \ + data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb + + # just copy over the CMVN to avoid having to recompute it. + cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ + utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ +fi + +if [ $stage -le 4 ]; then + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ + data/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l /dev/null || true + ( + steps/nnet3/decode.sh --nj 12 --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_dev_hires \ + ${graph_dir} data/dev_hires ${dir}/decode_dev || exit 1 + steps/nnet3/decode.sh --nj 7 --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + ${graph_dir} data/test_hires ${dir}/decode_test || exit 1 + ) || touch $dir/.error & + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/sprakbanken/s5/local/nnet3/run_tdnn.sh b/egs/sprakbanken/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..45794ac9ee4 --- /dev/null +++ b/egs/sprakbanken/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# This is the standard "tdnn" system, built in nnet3 + +# by default: +# local/nnet3/run_tdnn.sh + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
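# The defaults below are turned into command-line options by
# utils/parse_options.sh, so a run can be resumed or tweaked without editing
# the script; a hypothetical invocation, mirroring the commented calls in
# run.sh, might be:
#   local/nnet3/run_tdnn.sh --stage 8 --tdnn-affix "0" --nnet3-affix ""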
+stage=0 +nj=30 +decode_nj=7 +min_seg_len=1.55 +train_set=train +gmm=tri3b # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" +remove_egs=true +relu_dim=750 +num_epochs=3 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null + ( + steps/nnet3/decode.sh --nj 7 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + ${graph_dir} data/test_hires ${dir}/decode_test || exit 1 + steps/nnet3/decode.sh --nj 12 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_dev_hires \ + ${graph_dir} data/dev_hires ${dir}/decode_dev || exit 1 + ) || touch $dir/.error & + + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/sprakbanken/s5/local/run_basis_fmllr.sh b/egs/sprakbanken/s5/local/run_basis_fmllr.sh deleted file mode 100755 index 3c04e480a0a..00000000000 --- a/egs/sprakbanken/s5/local/run_basis_fmllr.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -. cmd.sh - -mfccdir=mfcc - -# Make "per-utterance" versions of the test sets where the speaker -# information corresponds to utterances-- to demonstrate adaptation on -# short utterances, particularly for basis fMLLR -for x in test_eval92 test_eval93 test_dev93 ; do - y=${x}_utt - rm -r data/$y - cp -r data/$x data/$y - cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk; - cp data/$y/utt2spk data/$y/spk2utt; - steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; -done - - - # basis fMLLR experiments. - # First a baseline: decode per-utterance with normal fMLLR. -steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_utt || exit 1; -steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_utt || exit 1; - - # get the fMLLR basis. -steps/get_fmllr_basis.sh --cmd "$train_cmd" data/train_si84 data/lang exp/tri3b - - # decoding tri3b with basis fMLLR -steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri3b/graph_tgpr data/test_dev93 exp/tri3b/decode_tgpr_dev93_basis || exit 1; -steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri3b/graph_tgpr data/test_eval92 exp/tri3b/decode_tgpr_eval92_basis || exit 1; - - # The same, per-utterance. -steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_basis_utt || exit 1; -steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_basis_utt || exit 1; - - diff --git a/egs/sprakbanken/s5/local/run_kl_hmm.sh b/egs/sprakbanken/s5/local/run_kl_hmm.sh deleted file mode 100644 index 9e7679a7675..00000000000 --- a/egs/sprakbanken/s5/local/run_kl_hmm.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Idiap Research Institute (Author: David Imseng) -# Apache 2.0 - -. 
cmd.sh - -states=20000 -dir=exp/tri4b_pretrain-dbn_dnn/ - -steps/kl_hmm/build_tree.sh --cmd "$big_memory_cmd" --thresh -1 --nnet_dir exp/tri4b_pretrain-dbn_dnn/ \ - ${states} data-fmllr-tri4b/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b-${states} || exit 1; - -utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri4b-${states} exp/tri4b-${states}/graph_bd_tgpr || exit 1; - -steps/kl_hmm/train_kl_hmm.sh --nj 30 --cmd "$big_memory_cmd" --model exp/tri4b-${states}/final.mdl data-fmllr-tri4b/train_si284 exp/tri4b-${states} $dir/kl-hmm-${states} - -steps/kl_hmm/decode_kl_hmm.sh --nj 10 --cmd "$big_memory_cmd" --acwt 0.1 --nnet $dir/kl-hmm-${states}/final.nnet --model exp/tri4b-${states}/final.mdl \ - --config conf/decode_dnn.config exp/tri4b-${states}/graph_bd_tgpr/ data-fmllr-tri4b/test_dev93 $dir/decode_dev93_kl-hmm-bd-${states}_tst - -steps/kl_hmm/decode_kl_hmm.sh --nj 8 --cmd "$big_memory_cmd" --acwt 0.1 --nnet $dir/kl-hmm-${states}/final.nnet --model exp/tri4b-${states}/final.mdl \ - --config conf/decode_dnn.config exp/tri4b-${states}/graph_bd_tgpr/ data-fmllr-tri4b/test_eval92 $dir/decode_eval92_kl-hmm-bd-${states}_tst - - diff --git a/egs/sprakbanken/s5/local/run_raw_fmllr.sh b/egs/sprakbanken/s5/local/run_raw_fmllr.sh deleted file mode 100644 index c4847a93f27..00000000000 --- a/egs/sprakbanken/s5/local/run_raw_fmllr.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - - -steps/align_raw_fmllr.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ - data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84_raw - -steps/train_raw_sat.sh --cmd "$train_cmd" \ - 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84_raw exp/tri3c || exit 1; - - -mfccdir=mfcc -for x in test_eval92 test_eval93 test_dev93 ; do - y=${x}_utt - mkdir -p data/$y - cp data/$x/* data/$y || true - cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk; - cp data/$y/utt2spk data/$y/spk2utt; - steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; -done - -( -utils/mkgraph.sh data/lang_test_tgpr exp/tri3c exp/tri3c/graph_tgpr || exit 1; -steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93 || exit 1; -steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92 || exit 1; - -steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \ - exp/tri3c/graph_tgpr data/test_dev93_utt exp/tri3c/decode_tgpr_dev93_utt || exit 1; -steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \ - exp/tri3c/graph_tgpr data/test_eval92_utt exp/tri3c/decode_tgpr_eval92_utt || exit 1; - -steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 10 --cmd "$decode_cmd" \ - exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93_2fmllr || exit 1; -steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 8 --cmd "$decode_cmd" \ - exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92_2fmllr || exit 1; -)& - -( -utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri3c exp/tri3c/graph_bd_tgpr || exit 1; - -steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 8 exp/tri3c/graph_bd_tgpr \ - data/test_eval92 exp/tri3c/decode_bd_tgpr_eval92 - steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 10 exp/tri3c/graph_bd_tgpr \ - data/test_dev93 exp/tri3c/decode_bd_tgpr_dev93 -)& - -steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ - data/train_si284 data/lang exp/tri3c exp/tri3c_ali_si284 || exit 1; - - -steps/train_raw_sat.sh --cmd "$train_cmd" \ - 4200 40000 data/train_si284 data/lang exp/tri3c_ali_si284 
exp/tri4d || exit 1; -( - utils/mkgraph.sh data/lang_test_tgpr exp/tri4d exp/tri4d/graph_tgpr || exit 1; - steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4d/graph_tgpr data/test_dev93 exp/tri4d/decode_tgpr_dev93 || exit 1; - steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4d/graph_tgpr data/test_eval92 exp/tri4d/decode_tgpr_eval92 || exit 1; -) & - - -wait - - -#for x in exp/tri3{b,c}/decode_tgpr*; do grep WER $x/wer_* | utils/best_wer.sh ; done - diff --git a/egs/sprakbanken/s5/local/sprak_data_prep.sh b/egs/sprakbanken/s5/local/sprak_data_prep.sh index 1b2406620f2..c336b06e8af 100755 --- a/egs/sprakbanken/s5/local/sprak_data_prep.sh +++ b/egs/sprakbanken/s5/local/sprak_data_prep.sh @@ -18,29 +18,18 @@ utils=`pwd`/utils . ./path.sh -# Checks if python3 is available on the system and install python3 in userspace if not -# This recipe currently relies on version 3 because python3 uses utf8 as internal -# string representation - -#if ! which python3 >&/dev/null; then -# echo "Installing python3 since not on your path." -# pushd $KALDI_ROOT/tools || exit 1; -# extras/install_python3.sh || exit 1; -# popd -#fi - if [ ! -d $dir/download ]; then mkdir -p $dir/download/0565-1 $dir/download/0565-2 fi -echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while." +echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while. The connection closes every 50-60 seconds and the repo maintainers do not have othersuggestions than increasing the number of retries." if [ ! -f $dir/download/da.16kHz.0565-1.tar.gz ]; then - ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-1.tar.gz --directory-prefix=$dir/download ) + ( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-1.tar.gz --directory-prefix=$dir/download ) fi if [ ! -f $dir/download/da.16kHz.0565-2.tar.gz ]; then - ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-2.tar.gz --directory-prefix=$dir/download ) + ( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-2.tar.gz --directory-prefix=$dir/download ) fi if [ ! -f $dir/download/da.16kHz.0611.tar.gz ]; then diff --git a/egs/sprakbanken/s5/local/sprak_run_mmi_tri4b.sh b/egs/sprakbanken/s5/local/sprak_run_mmi_tri4b.sh deleted file mode 100755 index 83999bada53..00000000000 --- a/egs/sprakbanken/s5/local/sprak_run_mmi_tri4b.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -. ./cmd.sh - -# LM suffix -uid=$1 - -# Test set id -test=$2 - -steps/make_denlats.sh --nj 30 --sub-split 24 --cmd "$train_cmd" \ - --transform-dir exp/tri4b_ali \ - data/train data/lang exp/tri4b exp/tri4b_denlats || exit 1; - -steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \ - data/train data/lang exp/tri4b_ali exp/tri4b_denlats \ - exp/tri4b_mmi_b0.1 || exit 1; - -steps/decode.sh --nj 7 --cmd "$decode_cmd" --transform-dir exp/tri3b/decode_${uid}_$test \ - exp/tri4b_/graph_$uid data/$test exp/tri4b_mmi_b0.1/decode_${uid}_$test - -#first, train UBM for fMMI experiments. -steps/train_diag_ubm.sh --silence-weight 0.5 --nj 50 --cmd "$train_cmd" \ - 600 data/train data/lang exp/tri4b_ali exp/dubm4b - -# Next, fMMI+MMI. 
-steps/train_mmi_fmmi.sh \ - --boost 0.1 --cmd "$train_cmd" data/train data/lang exp/tri4b_ali exp/dubm4b exp/tri4b_denlats \ - exp/tri4b_fmmi_a || exit 1; - -for iter in 3 4 5 6 7 8; do - steps/decode_fmmi.sh --nj 5 --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri3b/decode_${uid}_$test exp/tri4b/graph_$uid data/$test \ - exp/tri4b_fmmi_a/decode_${uid}_${test}_it$iter & -done -# decode the last iter with the bd model. -#for iter in 8; do -# steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ -# --transform-dir exp/tri3b/decode_bd_tgpr_dev93 exp/tri4b/graph_bd_tgpr data/test_dev93 \ -# exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it$iter & -# steps/decode_fmmi.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ -# --transform-dir exp/tri3b/decode_bd_tgpr_eval92 exp/tri4b/graph_bd_tgpr data/test_eval92 \ -# exp/tri4b_fmmi_a/decode_tgpr_eval92_it$iter & -#done - - -# fMMI + mmi with indirect differential. -steps/train_mmi_fmmi_indirect.sh \ - --boost 0.1 --cmd "$train_cmd" data/train data/lang exp/tri4b_ali exp/dubm4b exp/tri4b_denlats \ - exp/tri4b_fmmi_indirect || exit 1; - -for iter in 3 4 5 6 7 8; do - steps/decode_fmmi.sh --nj 7 --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri3b/decode_${uid}_$test exp/tri4b/graph_$uid data/$test \ - exp/tri4b_fmmi_indirect/decode_${uid}_${test}_it$iter & -done - diff --git a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh deleted file mode 100755 index 55d6d60bf9d..00000000000 --- a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -# This script takes data prepared in a corpus-dependent way -# in data/local/, and converts it into the "canonical" form, -# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, -# data/train_si284, data/train_si84, etc. - -# Don't bother doing train_si84 separately (although we have the file lists -# in data/local/) because it's just the first 7138 utterances in train_si284. -# We'll create train_si84 after doing the feature extraction. - -. ./path.sh || exit 1; - -echo "Preparing train and test data" -srcdir=data/local/data -lmdir=data/local/arpa_lm -tmpdir=data/local/lm_tmp -lang_tmp=data/local/lang_tmp -lexicon=data/local/dict/transcripts -ccs=data/local/lang_tmp/cmuclmtk.ccs -lm_suffix=arpa -mkdir -p $lmdir -mkdir -p $tmpdir - -# Create context cue symbol file for cmuclmtk -echo -e '' > $ccs -echo -e '' >> $ccs - - -# Envelop LM training data in context cues -python3 local/sprak_prep_lm.py $lexicon $lmdir/lm_input - - -# Next, for each type of language model, create the corresponding FST -# and the corresponding lang_test_* directory. - -echo Preparing language models for test - -text2wfreq < $lmdir/lm_input | wfreq2vocab -top 40000 > $lmdir/sprak.vocab - -text2idngram -vocab $lmdir/sprak.vocab -idngram $lmdir/sprak.idngram < $lmdir/lm_input - -idngram2lm -linear -idngram $lmdir/sprak.idngram -vocab \ - $lmdir/sprak.vocab -arpa $lmdir/sprak.arpa -context $ccs - - -test=data/lang_test_${lm_suffix} -mkdir -p $test -cp -r data/lang/* $test - -cat $lmdir/sprak.arpa | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=$test/words.txt - $test/G.fst - - -utils/validate_lang.pl $test || exit 1; - -exit 0; diff --git a/egs/sprakbanken/s5/run.sh b/egs/sprakbanken/s5/run.sh index 53fd7b1484e..64a24deeabf 100755 --- a/egs/sprakbanken/s5/run.sh +++ b/egs/sprakbanken/s5/run.sh @@ -5,7 +5,6 @@ . 
./path.sh # so python3 is on the path if not on the system (we made a link to utils/).a nj=12 - stage=0 . utils/parse_options.sh @@ -125,12 +124,11 @@ if [ $stage -le 9 ]; then fi if [ $stage -le 10 ]; then -# Alignment used to train nnets and sgmms -steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; + # Alignment used to train nnets and sgmms + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; fi -##TODO: Add nnet3 and chain setups ## Works #local/sprak_run_nnet_cpu.sh tg dev @@ -139,5 +137,30 @@ fi #local/sprak_run_sgmm2.sh dev +# Run neural network setups based in the TEDLIUM recipe + +# Running the nnet3-tdnn setup will train an ivector extractor that +# is used by the subsequent nnet3 and chain systems (why --stage is +# specified) +#local/nnet3/run_tdnn.sh --tdnn-affix "0" --nnet3-affix "" + +# nnet3 LSTM +#local/nnet3/run_lstm.sh --stage 13 --affix "0" + +# nnet3 bLSTM +#local/nnet3/run_blstm.sh --stage 12 + + + +# chain TDNN +# This setup creates a new lang directory that is also used by the +# TDNN-LSTM system +#local/chain/run_tdnn.sh --stage 14 + +# chain TDNN-LSTM +local/chain/run_tdnn_lstm.sh --stage 17 + + # Getting results [see RESULTS file] -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +local/generate_results_file.sh 2> /dev/null > RESULTS + diff --git a/egs/sprakbanken_swe/s5/local/data_prep.py b/egs/sprakbanken_swe/s5/local/data_prep.py index f3b644a26b6..58a0898dc26 100755 --- a/egs/sprakbanken_swe/s5/local/data_prep.py +++ b/egs/sprakbanken_swe/s5/local/data_prep.py @@ -123,7 +123,7 @@ def create_parallel_kaldi(filelist, sphpipe, snd=False): if __name__ == '__main__': - flist = codecs.open(sys.argv[1], "r", "utf8").readlines() + flist = codecs.open(sys.argv[1], "r").readlines() outpath = sys.argv[2] if len(sys.argv) == 5: sndlist = codecs.open(sys.argv[3], "r").readlines() @@ -133,8 +133,8 @@ def create_parallel_kaldi(filelist, sphpipe, snd=False): traindata = create_parallel_kaldi(flist, "") textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8") - wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w","utf8") - utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w","utf8") + wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w") + utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w") textout.writelines(traindata[0]) wavout.writelines(traindata[1]) utt2spkout.writelines(traindata[2]) diff --git a/egs/sprakbanken_swe/s5/local/normalize_transcript.py b/egs/sprakbanken_swe/s5/local/normalize_transcript.py index 68e534df40c..90e45744e2a 100755 --- a/egs/sprakbanken_swe/s5/local/normalize_transcript.py +++ b/egs/sprakbanken_swe/s5/local/normalize_transcript.py @@ -18,6 +18,9 @@ } #removes all the above signs +from_chars = ''.join(normdict.keys()) +to_chars = ''.join(normdict.values()) + t_table = str.maketrans(normdict) ## Main @@ -25,13 +28,15 @@ transcript = codecs.open(sys.argv[1], "r", "utf8") outtext = codecs.open(sys.argv[2], "w", "utf8") -for line in transcript: - line = line.replace(".\Punkt", ".") - line = line.replace(",\Komma", ",") - normtext1 = line.translate(t_table) - normtext2 = re.sub(r' +', ' ', normtext1.strip()) - outtext.write(normtext2.upper() + "\n") +#TODO: Add number normalisation and remove uppercasing +for line in transcript: + line = line.replace(".\Punkt", ".") + line = line.replace(",\Komma", ",") + normtext1 = re.sub(r'[\.,:;\?]', '', line) 
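    # (This first substitution strips sentence punctuation; the next two map
    # tabs and backslashes to spaces and collapse runs of spaces. Note that
    # the final write() no longer appends a trailing "\n", unlike the line it
    # replaces, so the output records are not newline-terminated.)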
+ normtext2 = re.sub(r'[\t\\]', ' ', normtext1) + normtext3 = re.sub(r' +', ' ', normtext2.strip()) + outtext.write(normtext3.upper()) transcript.close() outtext.close() diff --git a/egs/sprakbanken_swe/s5/local/sprak_data_prep.sh b/egs/sprakbanken_swe/s5/local/sprak_data_prep.sh index ad6c6e2472f..19751815208 100755 --- a/egs/sprakbanken_swe/s5/local/sprak_data_prep.sh +++ b/egs/sprakbanken_swe/s5/local/sprak_data_prep.sh @@ -22,10 +22,10 @@ utils=`pwd`/utils # This recipe currently relies on version 3 because python3 uses utf8 as internal # string representation -if ! which python3 >&/dev/null; then - echo "Python3 is not installed, to install it you should probably do:" - echo "sudo apt-get install python3" || exit 1; -fi +#if ! which python3 >&/dev/null; then +# echo "Python3 is not installed, to install it you should probably do:" +# echo "sudo apt-get install python3" || exit 1; +#fi if [ ! -d $dir/download ]; then mkdir -p $dir/download/0467-1 $dir/download/0467-2 $dir/download/0467-3 @@ -34,19 +34,19 @@ fi echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while." if [ ! -f $dir/download/sve.16khz.0467-1.tar.gz ]; then - ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-1.tar.gz --directory-prefix=$dir/download ) + ( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-1.tar.gz --directory-prefix=$dir/download ) fi if [ ! -f $dir/download/sve.16khz.0467-2.tar.gz ]; then - ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-2.tar.gz --directory-prefix=$dir/download ) + ( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-2.tar.gz --directory-prefix=$dir/download ) fi if [ ! -f $dir/download/sve.16khz.0467-3.tar.gz ]; then - ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-3.tar.gz --directory-prefix=$dir/download ) + ( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-3.tar.gz --directory-prefix=$dir/download ) fi if [ ! -f $dir/download/sve.16khz.0467-1.tar.gz ]; then - ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0468.tar.gz --directory-prefix=$dir/download ) + ( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0468.tar.gz --directory-prefix=$dir/download ) fi echo "Corpus files downloaded." @@ -78,31 +78,31 @@ mkdir -p $dir/corpus_processed/training/0467-1 $dir/corpus_processed/training/04 # Create parallel file lists and text files, but keep sound files in the same location to save disk space # Writes the lists to data/local/data (~ 310h) echo "Creating parallel data for training data." 
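# The helper scripts invoked below are written for Python 3
# (normalize_transcript.py, for instance, relies on str.maketrans), so plain
# 'python' is assumed here to resolve to a Python 3 interpreter; a quick
# check, as a minimal sketch:
#   python -c 'import sys; print(sys.version_info[:2])'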
-python3 $local/sprak2kaldi.py $dir/download/0467-1 $dir/corpus_processed/training/0467-1 # ~140h -python3 $local/sprak2kaldi.py $dir/download/0467-2 $dir/corpus_processed/training/0467-2 # ~125h -python3 $local/sprak2kaldi.py $dir/download/0467-3 $dir/corpus_processed/training/0467-3 # ~128h +python $local/sprak2kaldi.py $dir/download/0467-1 $dir/corpus_processed/training/0467-1 # ~140h +python $local/sprak2kaldi.py $dir/download/0467-2 $dir/corpus_processed/training/0467-2 # ~125h +python $local/sprak2kaldi.py $dir/download/0467-3 $dir/corpus_processed/training/0467-3 # ~128h mv $dir/corpus_processed/training/0467-1/'r4670118.791213 8232' $dir/corpus_processed/training/0467-1/'r4670118.791213_8232' -for f in $dir/corpus_processed/training/0467-1/r4670118.791213_8232/*.txt; do mv "$f" "${f// /_}"; done +for f in $dir/corpus_processed/training/0467-1/r4670118.791213_8232/*.txt; do + mv "$f" "${f// /_}"; +done ( # Ditto test set (~ 93h) echo "Creating parallel data for test data." rm -rf $dir/corpus_processed/test/0468 mkdir -p $dir/corpus_processed/test/0468 - python3 $local/sprak2kaldi.py $dir/download/0468 $dir/corpus_processed/test/0468 + python $local/sprak2kaldi.py $dir/download/0468 $dir/corpus_processed/test/0468 ) - - # Create the LM training data ( echo "Writing the LM text to file and normalising." cat $dir/corpus_processed/training/0467-1/txtlist $dir/corpus_processed/training/0467-2/txtlist $dir/corpus_processed/training/0467-3/txtlist | while read l; do cat $l; done > $lmdir/lmsents - python3 local/normalize_transcript.py $lmdir/lmsents $lmdir/lmsents.norm + python local/normalize_transcript.py $lmdir/lmsents $lmdir/lmsents.norm sort -u $lmdir/lmsents.norm > $lmdir/transcripts.uniq -) & +) # Combine training file lists echo "Combine file lists."
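# A note on the rename loop above: "${f// /_}" is bash parameter expansion
# that replaces every space in $f with an underscore. A minimal standalone
# illustration, using a made-up filename:
#   f='r4670118.791213 8232/some file.txt'
#   echo "${f// /_}"    # -> r4670118.791213_8232/some_file.txt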