diff --git a/.gitattributes b/.gitattributes
index 5a815654b4c..bede44edf8a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,4 +15,6 @@ windows/INSTALL* eol=native
 windows/NewGuidCmd.exe.config text eol=crlf
 windows/NewGuidCmd.exe binary
+# Prevent git from changing CR-LF to LF when archiving (patch requires CR-LF on Windows).
+**/*.patch -text
diff --git a/.gitignore b/.gitignore
index f80ffac482d..62c22459577 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,11 +6,12 @@
 !/src/*/Makefile
 !/src/*/README
-# Compiled Object files
+# Compiled Object files and Python files
 *.slo
 *.lo
 *.o
 *.obj
+*.pyc
 # Compiled Dynamic libraries
 *.so
diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh
index 9bc2b3195ef..5ec5d4b715f 100644
--- a/egs/ami/s5/cmd.sh
+++ b/egs/ami/s5/cmd.sh
@@ -1,9 +1,24 @@
-# "queue.pl" uses qsub. The options to it are
-# options to qsub. If you have GridEngine installed,
-# change this to a queue you have access to.
-# Otherwise, use "run.pl", which will run jobs locally
-# (make sure your --num-jobs options are no more than
-# the number of cpus on your machine.
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 1G"
+export decode_cmd="queue.pl --mem 2G"
+# the use of cuda_cmd is deprecated but it is sometimes still used in nnet1
+# scripts.
+export cuda_cmd="queue.pl --gpu 1 --mem 20G"
+
+# the rest of this file is present for historical reasons.
+# In general it's best to rely on conf/queue.conf for cluster-specific
+# configuration.
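# A minimal sketch (not part of the patch) of how these exported variables are
# consumed. utils/queue.pl, utils/run.pl and utils/slurm.pl share one calling
# convention, so running everything on a single local machine is just:
#
#   export train_cmd="run.pl"
#   export decode_cmd="run.pl"
#
# and a typical parallel job launched through whichever wrapper is chosen looks
# like the following (the log and scp paths here are illustrative only):
#
#   $train_cmd JOB=1:4 exp/make_mfcc/train/make_mfcc.JOB.log \
#     compute-mfcc-feats scp:data/train/split4/JOB/wav.scp \
#       ark:mfcc/raw_mfcc_train.JOB.ark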
# On Eddie use: #export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" @@ -11,27 +26,13 @@ #export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" #export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" -# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay) -export train_cmd="queue.pl -l arch=*64* --mem 1G" -export decode_cmd="queue.pl -l arch=*64* --mem 2G" -export highmem_cmd="queue.pl -l arch=*64* --mem 4G" -export scoring_cmd="queue.pl -l arch=*64*" -export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G" -export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" - -# To run locally, use: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export highmem_cmd=run.pl -#export cuda_cmd=run.pl - if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/ami/s5/local/nnet3/run_lstm.sh b/egs/ami/s5/local/nnet3/run_lstm.sh index 29ebf6ca601..d077d14cc1e 100755 --- a/egs/ami/s5/local/nnet3/run_lstm.sh +++ b/egs/ami/s5/local/nnet3/run_lstm.sh @@ -18,7 +18,7 @@ stage=0 train_stage=-10 mic=ihm -use_ihm_ali=false +use_ihm_ali=false use_sat_alignments=false # if true, use tri4a alignments are used # by default GMM-HMM systems are not built to this stage # in SDM and MDM systems. So run the tri4a stage if you @@ -66,7 +66,7 @@ decode_iter= echo "$0 $@" # Print the command line for logging -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index b4d41d7066a..b9d60d78182 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -10,13 +10,13 @@ mic=ihm stage=0 . 
utils/parse_options.sh -# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -AMI_DIR=$PWD/wav_db # Default, -case $(hostname -d) in +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, @@ -86,7 +86,7 @@ if [ $stage -le 5 ]; then data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali # Decode, graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} @@ -104,26 +104,26 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali # Decode, graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi +fi if [ $stage -le 7 ]; then # Train tri4a, which is LDA+MLLT+SAT, steps/train_sat.sh --cmd "$train_cmd" \ 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri4a - # Decode, + # Decode, graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} fi nj_mmi=80 @@ -160,11 +160,11 @@ if [ $stage -le 11 ]; then decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_dev_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_dev_${LM} --iter $i \ - $graph_dir data/$mic/dev $decode_dir + $graph_dir data/$mic/dev $decode_dir decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_eval_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_eval_${LM} --iter $i \ - $graph_dir data/$mic/eval $decode_dir + $graph_dir data/$mic/eval $decode_dir done fi @@ -181,7 +181,7 @@ if [ $stage -le 13 ]; then --hidden-dim 950 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments true - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri4a \ diff --git a/egs/aspire/s5/local/nnet3/run_autoencoder.sh b/egs/aspire/s5/local/nnet3/run_autoencoder.sh new file mode 100644 index 00000000000..abc7f3a6234 --- /dev/null +++ b/egs/aspire/s5/local/nnet3/run_autoencoder.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# this is an 
example to show a "tdnn" system in raw nnet configuration +# i.e. without a transition model + +. cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +num_data_reps=10 + +remove_egs=true + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $targets_scp +done + +if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + + num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ + --feat-dir ${data_dir} \ + --relu-dim=1024 \ + --add-lda=false \ + --objective-type=quadratic \ + --add-final-sigmoid=false \ + --include-log-softmax=false \ + --use-presoftmax-prior-scale=false \ + --num-targets=$num_targets \ + $dir/configs || exit 1; +fi + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/tdnn/train_raw_nnet.sh --stage $train_stage \ + --cmd "$decode_cmd" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --num-epochs 2 \ + --num-jobs-initial 3 \ + --num-jobs-final 16 \ + --initial-effective-lrate 0.0017 \ + --final-effective-lrate 0.00017 \ + --egs-dir "$common_egs_dir" \ + --remove-egs $remove_egs \ + --use-gpu true \ + --dense-targets true \ + ${data_dir} $targets_scp $dir || exit 1 +fi + diff --git a/egs/aurora4/s5/cmd.sh b/egs/aurora4/s5/cmd.sh index 139b2cd6c6c..378febca15b 100644 --- a/egs/aurora4/s5/cmd.sh +++ b/egs/aurora4/s5/cmd.sh @@ -1,29 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still used in some example scripts +# here. 
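# A rough sketch (not part of the patch) of what a conf/queue.conf for a
# GridEngine cluster can look like; it follows the 'default_config' format
# described in utils/queue.pl, and the queue/resource names below are
# assumptions that must be adapted to the local cluster:
#
#   command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
#   option mem=* -l mem_free=$0,ram_free=$0
#   option mem=0
#   option num_threads=* -pe smp $0
#   option num_threads=1
#   option gpu=0
#   option gpu=* -l gpu=$0 -q g.q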
export cuda_cmd="queue.pl --gpu 1" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/babel/s5/cmd.sh b/egs/babel/s5/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5/cmd.sh +++ b/egs/babel/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5b/cmd.sh b/egs/babel/s5b/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/babel/s5b/cmd.sh +++ b/egs/babel/s5b/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5c/cmd.sh b/egs/babel/s5c/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5c/cmd.sh +++ b/egs/babel/s5c/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/bn_music_speech/v1/cmd.sh b/egs/bn_music_speech/v1/cmd.sh index 27d1d36a6a6..d1ca1a6d126 100755 --- a/egs/bn_music_speech/v1/cmd.sh +++ b/egs/bn_music_speech/v1/cmd.sh @@ -1,17 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" - -#c) run it locally... -#export train_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/callhome_egyptian/s5/cmd.sh b/egs/callhome_egyptian/s5/cmd.sh index ab29f13d4cc..71dd849a93b 100755 --- a/egs/callhome_egyptian/s5/cmd.sh +++ b/egs/callhome_egyptian/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/chime1/s5/cmd.sh b/egs/chime1/s5/cmd.sh index dda6226f419..0dcd5a9200f 100755 --- a/egs/chime1/s5/cmd.sh +++ b/egs/chime1/s5/cmd.sh @@ -1,39 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - - -#c) USFD cluster options -#config="conf/queue_usfd.conf" -#export train_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export decode_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export mkgraph_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export cuda_cmd="queue.pl --config $config --mem 24G --rmem 20G --gpu 1 --time 24:00:00" - - -#d) run it locally... -export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" + +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime2/s5/cmd.sh b/egs/chime2/s5/cmd.sh index 8bb00fe0ec6..0dcd5a9200f 100644 --- a/egs/chime2/s5/cmd.sh +++ b/egs/chime2/s5/cmd.sh @@ -1,30 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G" -export cuda_cmd="queue.pl -l gpu=1" -#export cuda_cmd="..." +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/csj/s5/cmd.sh b/egs/csj/s5/cmd.sh index d5952fe0f87..71dd849a93b 100644 --- a/egs/csj/s5/cmd.sh +++ b/egs/csj/s5/cmd.sh @@ -1,31 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64*" -#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export train_cmd="run.pl" -export decode_cmd="run.pl" -#export cuda_cmd="..." -#export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export mkgraph_cmd="run.pl" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/farsdat/s5/cmd.sh b/egs/farsdat/s5/cmd.sh index d749f2c9f1f..71dd849a93b 100644 --- a/egs/farsdat/s5/cmd.sh +++ b/egs/farsdat/s5/cmd.sh @@ -1,25 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export cuda_cmd="run.pl" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=2500M,mem_free=2500M,matylda5=0.5" -#export decode_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=3000M,mem_free=3000M,matylda5=0.1" -#export mkgraph_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=4G,mem_free=4G,matylda5=3" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu*,long.q@pco203-0[0124] -l gpu=1" - -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index ab29f13d4cc..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index f453ab42058..8fe80b46784 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -1,13 +1,13 @@ #!/bin/bash # # Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. # To be run from one directory above this script. # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the +# id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) @@ -18,8 +18,8 @@ export LC_ALL=C if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories, se -e ../run.sh for example." 
+ echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" exit 1; fi @@ -72,20 +72,20 @@ fi speech_d1=$dir/links/LDC2010S01/DISC1/data/speech speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts #Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -105,7 +105,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -149,7 +149,7 @@ if [ $stage -le 3 ]; then for f in `cat $tmpdir/train_sph.flist`; do # convert to absolute path readlink -e $f - done > $tmpdir/train_sph_abs.flist + done > $tmpdir/train_sph_abs.flist cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 0f2bd037ba0..6d04f53c7e5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,12 +22,32 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" ]; then - # Merge with gigaword corpus - $local/merge_lexicons.py - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords + if [ ! -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! 
-e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xovzf es_wordlist.json.tgz || exit 1; + cd $cwd fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords fi #Then get the list of phones form basic_rules in the lexicon folder @@ -50,6 +70,7 @@ if [ $stage -le 2 ]; then # representation cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ > $tmpdir/lexicon_raw fi diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 8c67ae56804..5c09f09bc35 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -7,55 +7,58 @@ import sys import json import codecs -import os import operator -wordlimit=64000 -uw_fisher="data/local/tmp/uniquewords" -uw_gigaword="/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" -uw_LDC="/export/corpora/LDC/LDC96L16/callhome_spanish_lexicon_970908/preferences" +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary fisher = codecs.open(uw_fisher, encoding='utf-8') for line in fisher: - merged_lexicon.append(line.strip()) + merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the fisher data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the LDC data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print "After adding the Gigaword data, the lexicon contains " + str(len(merged_lexicon)) + " entries." + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print "After adding the Gigaword data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." 
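# With the arguments introduced above, the script is no longer tied to
# hard-coded /export paths; it is invoked by local/fsp_prepare_dict.sh roughly
# as follows (paths are illustrative, taken from this recipe's defaults):
#
#   local/merge_lexicons.py data/local/tmp /export/corpora/LDC/LDC96L16
#
# i.e. first the tmp directory holding uniquewords and es_wordlist.json, then
# the root of the LDC96L16 CALLHOME Spanish lexicon.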
# Now write the uniquewords to a file -lf = codecs.open('data/local/tmp/uniquewords64k', encoding='utf-8', mode='w+') +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + lf.write(item + "\n") lf.close() print "Finshed writing unique words" - diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 706f3793278..edd7f56bad2 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -17,12 +17,10 @@ set -e sfisher_speech=/home/mpost/data/LDC/LDC2010S01 sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 -#split=/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt split=local/splits/split_fisher callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 -#split_callhome=/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome split=local/splits/split_callhome local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts @@ -33,16 +31,16 @@ local/fsp_prepare_dict.sh $spanish_lexicon # Rewrite ----------------------------- This section is no longer needed---- # At this point, it might make sense to use a bigger lexicon -# The one I will use is derived from this exercise (spanish fisher) and -# the LDC spanish lexicon along with the most frequent words derived from the +# The one I will use is derived from this exercise (spanish fisher) and +# the LDC spanish lexicon along with the most frequent words derived from the # gigaword corpus such that the total number of entries in the lexicon # are 64k # To generate the merged lexicon, run # /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py # you might have to set the locations of the three lexicons within this -# file. Note that the LDC rule base phoneme generator works only from its -# own directory. So the merged lexicon is actually created in +# file. Note that the LDC rule base phoneme generator works only from its +# own directory. So the merged lexicon is actually created in # /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k # This can be easily fixed and will be done. #TODO # Also run the clean lexicon script to take care of non stressable vowels @@ -57,11 +55,11 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl -# to get the numbers. Depending on your needs, you might have to change the size of -# the splits within that file. The default paritions are based on the Kaldi + Joshua +# to get the numbers. Depending on your needs, you might have to change the size of +# the splits within that file. 
The default paritions are based on the Kaldi + Joshua # requirements which means that I have very large dev and test sets local/fsp_train_lms.sh $split local/fsp_create_test_lang.sh @@ -95,7 +93,7 @@ cp -r data/local/data/callhome_train_all data/callhome_train_all # MT Tune : Same as the ASR eval set (Use the lattices from here) # MT Eval : 20k utterances # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker -# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. +# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. # As noted above, the LM has not been trained on the dev and the test sets. #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test @@ -136,7 +134,7 @@ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k -utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train_10k_nodup data/lang exp/mono0a @@ -178,7 +176,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/cmd.sh b/egs/fisher_swbd/s5/cmd.sh index e3294fde05a..88db78823a5 100644 --- a/egs/fisher_swbd/s5/cmd.sh +++ b/egs/fisher_swbd/s5/cmd.sh @@ -1,32 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - -#d) Gorgon cluster -#export train_cmd="gorgon_queue.pl -q gorgon" -#export decode_cmd="gorgon_queue.pl -q gorgon" -#export cuda_cmd="gorgon_queue.pl -q gorgon" -#export mkgraph_cmd="gorgon_queue.pl -q gorgon" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
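# For comparison, a sketch (not part of the patch) of the same file written for
# a Slurm cluster; the memory values are simply copied from the GridEngine
# version below and would need tuning:
#
#   export train_cmd="slurm.pl --mem 4G"
#   export decode_cmd="slurm.pl --mem 4G"
#   export mkgraph_cmd="slurm.pl --mem 8G"
#
# utils/slurm.pl takes the same --mem/--gpu style options and, like queue.pl,
# contains a 'default_config' string that can be copied into conf/ and edited.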
+ +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/conf/MSU_single_letter.txt b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt new file mode 100644 index 00000000000..1f7b419cca7 --- /dev/null +++ b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt @@ -0,0 +1,26 @@ +A ey +B b iy +C s iy +D d iy +E iy +F eh f +G jh iy +H ey ch +I ay +J jh ey +K k ey +L eh l +M eh m +N eh n +O ow +P p iy +Q k y uw +R aa r +S eh s +T t iy +U y uw +V v iy +W d ah b ax l y uw +X eh k s +Y w ay +Z z iy diff --git a/egs/fisher_swbd/s5/local/dict.patch b/egs/fisher_swbd/s5/local/dict.patch new file mode 100644 index 00000000000..7fcaa98b4f5 --- /dev/null +++ b/egs/fisher_swbd/s5/local/dict.patch @@ -0,0 +1,378 @@ +8645a8646 +> uh-hum ah m hh ah m +9006c9007 +< April ey p r ih l +--- +> April ey p r ax l +9144d9144 +< B ay zh aa n iy z +9261c9261 +< Battle b ae t el +--- +> Battle b ae t ax l +10014a10015 +> Chevy sh eh v iy +10211a10213 +> Colorado k ao l ax r aa d ow +10212a10215 +> Colorado' k ao l ax r aa d ow z +10370c10373 +< Creek k r ih k +--- +> Creek k r iy k +10889a10893 +> Eleven ax l eh v ih n +10951c10955 +< Erie ih r iy +--- +> Erie iy r iy +11183c11187 +< Forever f ax r eh v er +--- +> Forever f er eh v er +11231a11236 +> Friday f r ay d iy +11744a11750 +> History hh ih s t r iy +12004a12011,12012 +> Israel ih z r ih l +> Israel's ih z r ih l z +12573a12582 +> Lincoln l ih ng k ih n +12574a12584 +> Lincolns l ih ng k ih n z +13268c13278 +< NAACP eh ey ey s iy p iy +--- +> NAACP eh n ey ey s iy p iy +13286c13296 +< NIT eh ay t iy +--- +> NIT eh n ay t iy +13292c13302 +< NTSC eh t iy eh s s iy +--- +> NTSC eh n t iy eh s s iy +14058a14069 +> Quarter k ow r t er +14059a14071 +> Quarterback k ow r t er b ae k +14060a14073 +> Quarters k ow r t er z +14569a14583 +> Science s ay n s +15087a15102 +> Sunday s ah n d iy +15088a15104 +> Sunday's s ah n d iy z +15089a15106 +> Sundays s ah n d iy z +15290,15291c15307,15308 +< Texan t eh k sh ih n +< Texan's t eh k sh ih n s +--- +> Texan t eh k s ih n +> Texan's t eh k s ih n s +15335a15353 +> Thousands th aw z ih n z +15739c15757 +< Waco w ae k ow +--- +> Waco w ey k ow +15841a15860 +> Weekends w iy k eh n z +16782a16802 +> acceptable eh k s eh p ax b ax l +16833a16854 +> accounting ax k aw n ih ng +16948a16970 +> address ax d r eh s +17281a17304 +> already aa r d iy +17315a17339 +> am m +17709a17734 +> asked ae s t +17847a17873 +> attorney ih t er n iy +17919a17946 +> autopilot ao t ow p ay l ih t +17960a17988 +> awfully ao f l iy +18221a18250 +> basketball b ae s k ax b ao l +18222a18252 +> basketball's b ae s k ax b ao l z +18302a18333 +> become b ah k ah m +18303a18335 +> becomes b iy k ah m z +18344a18377 +> began b ax g en n +18817c18850 +< bottle b aa t el +--- +> bottle b aa t ax l +19332,19333c19365,19367 +< camera's k ae m ax r ax z +< cameras k ae m ax r ax z +--- +> camera k ae m r ax +> camera's k ae m r ax z +> cameras k ae m r ax z +19411a19446 +> capital k ae p ax l +19505a19541 +> carrying k ae r ih ng +20316a20353,20354 +> combination k aa m ih n ey sh ih n +> combinations k aa m ih n ey sh ih n z +20831a20870 +> contracts k aa n t r ae k s +21010a21050 +> costs k ao s +21062a21103 +> county k aw n iy +21371a21413 +> cultural k ao l ch ax r ax l +21372a21415 +> culturally k ao l ch ax r ax l iy +21373a21417 +> culture k ao l ch er +21375a21420 +> cultures k ao l ch er z +21543a21589 +> data d ey t ax +22097a22144 +> differently d ih f ax r ih n t l 
iy +22972a23020 +> effects ax f eh k t s +23016a23065 +> election ax l eh k sh ih n +23018a23068 +> elections ax l eh k sh ih n z +23052a23103 +> eleven ax l eh v ih n +23242a23294 +> enjoyable ae n jh oy ax b ax l +23248a23301 +> enjoys ae n jh oy z +23293a23347 +> entire ih n t ay r +23295a23350,23351 +> entirely ih n t ay r l iy +> entirety ih n t ay r t iy +23745a23802 +> extra eh k s t er +23818a23876 +> facts f ae k s +24508c24566 +< forever f ax r eh v er +--- +> forever f er eh v er +24514c24572 +< forget f ow r g eh t +--- +> forget f er r g eh t +24521a24580 +> forgot f er r g aa t +24522a24582 +> forgotten f er r g aa t ax n +24563a24624 +> forward f ow er d +24680a24742 +> frightening f r ay t n ih ng +24742a24805 +> full-time f ax l t ay m +24862a24926 +> garage g r aa jh +25218a25283 +> grandmother g r ae m ah dh er +25790a25856 +> heavily hh eh v ax l iy +25949a26016 +> history hh ih s t r iy +26038a26106 +> honestly aa n ax s t l iy +26039a26108 +> honesty aa n ax s t iy +26099a26169 +> horror hh ow r +26155a26226 +> houses hh aw z ih z +26184c26255 +< huh-uh hh ah hh ah +--- +> huh-uh ah hh ah +26189c26260 +< hum-um hh m hh m +--- +> hum-um ah m hh ah m +26236a26308 +> hunting hh ah n ih ng +26307a26380,26381 +> ideal ay d iy l +> idealist ay d iy l ih s t +26369a26444 +> imagine m ae jh ih n +26628a26704 +> individuals ih n d ih v ih jh ax l z +26968a27045 +> interest ih n t r ih s t +27184a27262 +> it'd ih d +27702a27781 +> lead l iy d +28378a28458 +> mandatory m ae n d ih t ow r iy +28885a28966 +> minute m ih n ih t +29167a29249 +> mountains m aw t n z +29317a29400 +> mysteries m ih s t r iy z +29318a29402 +> mystery m ih s t r iy +29470a29555 +> nervous n er v ih s +29578,29580c29663,29665 +< nobody n ow b aa d iy +< nobody'll n ow b aa d iy l +< nobody's n ow b aa d iy z +--- +> nobody n ow b ah d iy +> nobody'll n ow b ah d iy l +> nobody's n ow b ah d iy z +29712a29798 +> nuclear n uw k l iy r +29938a30025 +> onto aa n t ax +30051a30139 +> originally ax r ih jh ax l iy +30507a30596 +> particularly p er t ih k y ax l iy +30755a30845 +> perfectly p er f ih k l iy +30820a30911 +> personally p er s n ax l iy +30915a31007 +> physically f ih z ih k l iy +30986a31079 +> pilot p ay l ih t +30987a31081 +> pilot's p ay l ih t s +31227a31322 +> police p l iy s +31513a31609 +> prefer p er f er +31553a31650 +> prepare p r ax p ey r +31578a31676 +> prescription p er s k r ih p sh ih n +31579a31678 +> prescriptions p er s k r ih p sh ih n z +31770a31870 +> products p r aa d ax k s +31821a31922 +> projects p r aa jh eh k s +31908a32010 +> protect p er t eh k t +31909a32012 +> protected p er t eh k t ih d +31911a32015 +> protection p er t eh k sh ih n +31914a32019 +> protection p er t eh k t ih v +32149a32255 +> quarter k ow r t er +32414a32521 +> read r iy d +32785a32893 +> rehabilitation r iy ax b ih l ih t ey sh ih n +33150a33259 +> resource r ih s ow r s +33151a33261 +> resources r iy s ow r s ih z +33539c33649 +< roots r uh t s +--- +> roots r uw t s +33929a34040 +> science s ay n s +34315a34427 +> seventy s eh v ih n iy +34319,34320c34431,34432 +< severe s ax v iy r +< severely s ax v iy r l iy +--- +> severe s ih v iy r +> severely s ih v iy r l iy +35060a35173 +> software s ao f w ey r +35083a35197 +> solid s ao l ih d +35084a35199 +> solidly s ao l ih d l iy +35750a35866 +> stood s t ih d +35854a35971 +> strictly s t r ih k l iy +35889c36006 +< stronger s t r ao ng er +--- +> stronger s t r ao ng g er +36192a36310,36311 +> supposed s p ow z +> supposed s p ow s +36510a36630 
+> tastes t ey s +36856a36977 +> thoroughly th er r l iy +36866a36988 +> thousands th aw z ih n z +37081c37203 +< toots t uh t s +--- +> toots t uw t s +37157a37280 +> toward t w ow r d +37158a37282 +> towards t w ow r d z +37564a37689 +> twenties t w eh n iy z +37565a37691 +> twentieth t w eh n iy ih th +37637a37764 +> unacceptable ah n ae k s eh p ax b ax l +37728a37856 +> understand ah n d er s t ae n +37860a37989 +> unless ih n l eh s +38040a38170 +> use y uw z +38049a38180 +> uses y uw z ih z +38125a38257 +> various v ah r iy ih s +38202a38335 +> versus v er s ih z +38381c38514 +< wacko w ae k ow +--- +> wacko w ey k ow +38455c38588 +< wanna w aa n ax +--- +> wanna w ah n ax +38675c38808 +< whatnot w ah t n aa t +--- +> whatnot w aa t n aa t +38676a38810 +> whatsoever w aa t s ow eh v er +38890c39024 +< wok w aa k +--- +> wok w ao k +38910a39045 +> wondering w ah n d r ih ng diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms.sh b/egs/fisher_swbd/s5/local/fisher_train_lms.sh index 5d8b9e2e18d..a9e3fa4566a 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh index ebc954b756b..3133af6ee1f 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/swbd1_data_download.sh b/egs/fisher_swbd/s5/local/swbd1_data_download.sh new file mode 100755 index 00000000000..95c9d5e58a4 --- /dev/null +++ b/egs/fisher_swbd/s5/local/swbd1_data_download.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Switchboard-1 training data preparation customized for Edinburgh +# Author: Arnab Ghoshal (Jan 2013) + +# To be run from one directory above this script. + +## The input is some directory containing the switchboard-1 release 2 +## corpus (LDC97S62). Note: we don't make many assumptions about how +## you unpacked this. We are just doing a "find" command to locate +## the .sph files. + +. path.sh + +#check existing directories +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" + exit 1; +fi + +SWBD_DIR=$1 + +dir=data/local/train_swbd +mkdir -p $dir + +# Audio data directory check +if [ ! -d $SWBD_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# Trans directory check +if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then + ( + cd $dir; + if [ ! 
-d swb_ms98_transcriptions ]; then + echo " *** Downloading transcriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || + wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz + tar -xf switchboard_word_alignments.tar.gz + fi + ) +else + echo "Directory with transcriptions exists, skipping downloading" + [ -f $dir/swb_ms98_transcriptions ] \ + || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ +fi diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 552e304a6a3..54513437dbe 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -14,7 +14,7 @@ #check existing directories if [ $# != 1 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD" exit 1; fi @@ -23,7 +23,6 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi @@ -34,22 +33,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - # To get the SWBD transcriptions and dict, do: - echo " *** Downloading transcriptions and dictionary ***" - ( - cd $dir; - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ !
-f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; @@ -101,7 +84,7 @@ local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final trans # format acronyms in text python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict/acronyms_swbd.map + -M data/local/dict_nosp/acronyms_swbd.map cp $dir/text $dir/text_bk mv $dir/text_map $dir/text diff --git a/egs/fisher_swbd/s5/run.sh b/egs/fisher_swbd/s5/run.sh index 4bb0a55b0a9..fa3ad62fa84 100755 --- a/egs/fisher_swbd/s5/run.sh +++ b/egs/fisher_swbd/s5/run.sh @@ -25,7 +25,6 @@ local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 # local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1 - utils/prepare_lang.sh data/local/dict_nosp \ "" data/local/lang_nosp data/lang_nosp @@ -135,15 +134,14 @@ local/remove_dup_utts.sh 300 data/train data/train_nodup ) # Start training on the Switchboard subset, which has cleaner alignments - steps/train_mono.sh --nj 3 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang_nopp exp/mono0a + data/train_10k_nodup data/lang_nosp exp/mono0a steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_30k_nodup data/lang_nopp exp/mono0a exp/mono0a_ali || exit 1; + data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; steps/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup data/lang_nopp exp/mono0a_ali exp/tri1a || exit 1; + 3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1; #used to be 2500 20000 ( graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg diff --git a/egs/gale_arabic/s5/cmd.sh b/egs/gale_arabic/s5/cmd.sh index 6e2777b595b..71dd849a93b 100755 --- a/egs/gale_arabic/s5/cmd.sh +++ b/egs/gale_arabic/s5/cmd.sh @@ -1,11 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/gale_mandarin/s5/cmd.sh b/egs/gale_mandarin/s5/cmd.sh index 6e2777b595b..2d51ad82004 100755 --- a/egs/gale_mandarin/s5/cmd.sh +++ b/egs/gale_mandarin/s5/cmd.sh @@ -1,11 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. 
If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated, but it's still used in this example +# directory. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/hkust/s5/cmd.sh b/egs/hkust/s5/cmd.sh index 2a46d89f385..71dd849a93b 100644 --- a/egs/hkust/s5/cmd.sh +++ b/egs/hkust/s5/cmd.sh @@ -1,13 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export train_cmd=run.pl -#export decode_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/cmd.sh b/egs/librispeech/s5/cmd.sh index 6395d96ca36..71dd849a93b 100644 --- a/egs/librispeech/s5/cmd.sh +++ b/egs/librispeech/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/run.sh b/egs/librispeech/s5/run.sh index 1cb03b04ffe..4a542cc30c0 100755 --- a/egs/librispeech/s5/run.sh +++ b/egs/librispeech/s5/run.sh @@ -2,7 +2,7 @@ # Set this to somewhere where you want to put your data, or where -# someone else has already put it. You'll want to change this +# someone else has already put it. You'll want to change this # if you're not on the CLSP grid. data=/export/a15/vpanayotov/data @@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data data_url=www.openslr.org/resources/12 lm_url=www.openslr.org/resources/11 -. cmd.sh -. path.sh +. ./cmd.sh +. ./path.sh # you might not want to do this for interactive shells. set -e @@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do done # download the LM resources -local/download_lm.sh $lm_url data/local/lm || exit 1 +local/download_lm.sh $lm_url data/local/lm # format the data as Kaldi data directories for part in dev-clean test-clean dev-other test-other train-clean-100; do # use underscore-separated names in data directories. - local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1 + local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) done ## Optional text corpus normalization and LM training @@ -39,7 +39,7 @@ done ## well as some intermediate data(e.g. the normalized text used for LM training), ## are available for download at http://www.openslr.org/11/ #local/lm/train_lm.sh $LM_CORPUS_ROOT \ -# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1 +# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm ## Optional G2P training scripts. 
## As the LM training scripts above, this script is intended primarily to @@ -49,24 +49,24 @@ done # when "--stage 3" option is used below we skip the G2P steps, and use the # lexicon we have already downloaded from openslr.org/11/ local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \ - data/local/lm data/local/lm data/local/dict_nosp || exit 1 + data/local/lm data/local/lm data/local/dict_nosp utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + "" data/local/lang_tmp_nosp data/lang_nosp -local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1 +local/format_lms.sh --src-dir data/lang_nosp data/local/lm # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_tglarge || exit 1; + data/lang_nosp data/lang_nosp_test_tglarge utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_fglarge || exit 1; + data/lang_nosp data/lang_nosp_test_fglarge mfccdir=mfcc # spread the mfccs over various machines, as this data-set is quite large. -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ + utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ $mfccdir/storage fi @@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k # train a monophone system steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ - data/train_2kshort data/lang_nosp exp/mono || exit 1; + data/train_2kshort data/lang_nosp exp/mono # decode using the monophone model ( utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \ - exp/mono exp/mono/graph_nosp_tgsmall || exit 1 + exp/mono exp/mono/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ - data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1 + data/$test exp/mono/decode_nosp_tgsmall_$test done )& @@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ # train a first delta + delta-delta triphone system on a subset of 5000 utterances steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ - 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1; + 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 # decode using the tri1 model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1; + exp/tri1 exp/tri1/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \ - data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri1/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test done )& steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - 
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k # train an LDA+MLLT system. steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1; + data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b # decode using the LDA+MLLT model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1; + exp/tri2b exp/tri2b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \ - data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri2b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test done )& # Align a 10k utts subset using the tri2b model steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ - data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k # Train tri3b, which is LDA+MLLT+SAT on 10k utts steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1; + data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b # decode using the tri3b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1; + exp/tri3b exp/tri3b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri3b/graph_nosp_tgsmall data/$test \ - exp/tri3b/decode_nosp_tgsmall_$test || exit 1; + exp/tri3b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test done )& # align the entire train_clean_100 subset using the tri3b model steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ data/train_clean_100 data/lang_nosp \ - exp/tri3b exp/tri3b_ali_clean_100 || exit 1; + exp/tri3b exp/tri3b_ali_clean_100 # train another LDA+MLLT+SAT system on the entire 100 hour subset steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ data/train_clean_100 data/lang_nosp \ - exp/tri3b_ali_clean_100 exp/tri4b || exit 1; + exp/tri3b_ali_clean_100 exp/tri4b # decode using the tri4b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1; + exp/tri4b exp/tri4b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_nosp_tgsmall data/$test \ - exp/tri4b/decode_nosp_tgsmall_$test || exit 1; + exp/tri4b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test 
exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test done )& @@ -205,125 +205,125 @@ steps/get_prons.sh --cmd "$train_cmd" \ utils/dict_dir_add_pronprobs.sh --max-normalize true \ data/local/dict_nosp \ exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1 + exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict utils/prepare_lang.sh data/local/dict \ "" data/local/lang_tmp data/lang local/format_lms.sh --src-dir data/lang data/local/lm utils/build_const_arpa_lm.sh \ - data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1; + data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge utils/build_const_arpa_lm.sh \ - data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1; + data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge # decode using the tri4b model with pronunciation and silence probabilities ( utils/mkgraph.sh \ - data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1; + data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_tgsmall data/$test \ - exp/tri4b/decode_tgsmall_$test || exit 1; + exp/tri4b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test done )& # align train_clean_100 using the tri4b model steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1; + data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 # if you want at this point you can train and test NN model(s) on the 100 hour # subset -local/nnet2/run_5a_clean_100.sh || exit 1 +local/nnet2/run_5a_clean_100.sh -local/download_and_untar.sh $data $data_url train-clean-360 || exit 1; +local/download_and_untar.sh $data $data_url train-clean-360 # now add the "clean-360" subset to the mix ... local/data_prep.sh \ - $data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1 + $data/LibriSpeech/train-clean-360 data/train_clean_360 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \ - exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + exp/make_mfcc/train_clean_360 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir # ... 
and then combine the two sets into a 460 hour one utils/combine_data.sh \ - data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1 + data/train_clean_460 data/train_clean_100 data/train_clean_360 # align the new, combined set, using the tri4b model steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1; + data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 # create a larger SAT model, trained on the 460 hours of data. steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \ - data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1; + data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b # decode using the tri5b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri5b exp/tri5b/graph_tgsmall || exit 1; + exp/tri5b exp/tri5b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri5b/graph_tgsmall data/$test \ - exp/tri5b/decode_tgsmall_$test || exit 1; + exp/tri5b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test done )& # train a NN model on the 460 hour set -local/nnet2/run_6a_clean_460.sh || exit 1 +local/nnet2/run_6a_clean_460.sh -local/download_and_untar.sh $data $data_url train-other-500 || exit 1; +local/download_and_untar.sh $data $data_url train-other-500 # prepare the 500 hour subset. local/data_prep.sh \ - $data/LibriSpeech/train-other-500 data/train_other_500 || exit 1 + $data/LibriSpeech/train-other-500 data/train_other_500 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \ - exp/make_mfcc/train_other_500 $mfccdir || exit 1 + exp/make_mfcc/train_other_500 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1 + data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir # combine all the data utils/combine_data.sh \ - data/train_960 data/train_clean_460 data/train_other_500 || exit 1 + data/train_960 data/train_clean_460 data/train_other_500 steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1; + data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script # as it is faster. 
steps/train_quick.sh --cmd "$train_cmd" \ - 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1; + 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b # decode using the tri6b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri6b exp/tri6b/graph_tgsmall || exit 1; + exp/tri6b exp/tri6b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1; + exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test done )& @@ -349,7 +349,7 @@ steps/train_quick.sh --cmd "$train_cmd" \ # train NN models on the entire dataset -local/nnet2/run_7a_960.sh || exit 1 +local/nnet2/run_7a_960.sh # # train models on cleaned-up data # # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh diff --git a/egs/lre/v1/cmd.sh b/egs/lre/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre/v1/cmd.sh +++ b/egs/lre/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre07/v1/cmd.sh b/egs/lre07/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre07/v1/cmd.sh +++ b/egs/lre07/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
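The cmd.sh comment blocks added throughout this patch refer to conf/queue.conf without showing one. The sketch below is only an illustration of what such a file can look like on a GridEngine cluster; it is modeled on the default_config string documented in utils/queue.pl, and the qsub flags, resource names and the g.q GPU queue are assumptions that must be adapted to your own grid.

# conf/queue.conf -- illustrative sketch only; adapt to your cluster
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0
option num_threads=* -pe smp $0
option num_threads=1
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q

With a file like this in place, an invocation such as "queue.pl --mem 4G" maps the mem option onto the matching "-l mem_free=4G,ram_free=4G" qsub arguments, which is why the new cmd.sh files can use queue-independent options like --mem and --gpu.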
+ +export train_cmd="queue.pl --mem 4G" diff --git a/egs/reverb/s5/RESULTS b/egs/reverb/s5/RESULTS index 031a6b2ec1a..3537852a827 100644 --- a/egs/reverb/s5/RESULTS +++ b/egs/reverb/s5/RESULTS @@ -1,306 +1,150 @@ -local/summarize_results.pl tri2a -#### RESULTS FOR dt ##### - -exp/tri2a/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 89.00 -RealData_dt_for_1ch_near_room1_A 90.39 -SimData_dt_for_1ch_far_room1_A 22.35 -SimData_dt_for_1ch_far_room2_A 88.37 -SimData_dt_for_1ch_far_room3_A 90.85 -SimData_dt_for_1ch_near_room1_A 12.29 -SimData_dt_for_1ch_near_room2_A 42.86 -SimData_dt_for_1ch_near_room3_A 50.17 -Avg_Sim(6) 51.15 -Avg_Real(2) 89.69 - - -#### RESULTS FOR et ##### - -exp/tri2a/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 51.88 -RealData_dt_for_1ch_near_room1_A 56.14 -SimData_dt_for_1ch_far_room1_A 17.45 -SimData_dt_for_1ch_far_room2_A 44.02 -SimData_dt_for_1ch_far_room3_A 49.90 -SimData_dt_for_1ch_near_room1_A 15.29 -SimData_dt_for_1ch_near_room2_A 22.11 -SimData_dt_for_1ch_near_room3_A 26.34 -Avg_Sim(6) 29.18 -Avg_Real(2) 54.01 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.95 -RealData_dt_for_1ch_near_room1_A 48.91 -SimData_dt_for_1ch_far_room1_A 16.37 -SimData_dt_for_1ch_far_room2_A 35.67 -SimData_dt_for_1ch_far_room3_A 39.59 -SimData_dt_for_1ch_near_room1_A 13.03 -SimData_dt_for_1ch_near_room2_A 17.08 -SimData_dt_for_1ch_near_room3_A 20.00 +#################### +exp/tri2a/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 89.13 +RealData_dt_for_1ch_near_room1_A 90.27 +SimData_dt_for_1ch_far_room1_A 22.44 +SimData_dt_for_1ch_far_room2_A 88.44 +SimData_dt_for_1ch_far_room3_A 91.27 +SimData_dt_for_1ch_near_room1_A 12.19 +SimData_dt_for_1ch_near_room2_A 42.74 +SimData_dt_for_1ch_near_room3_A 49.31 +Avg_Real(2) 89.70 +Avg_Sim(6) 51.06 + +exp/tri2a/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 88.45 +RealData_et_for_1ch_near_room1_A 88.66 +SimData_et_for_1ch_far_room1_A 22.72 +SimData_et_for_1ch_far_room2_A 81.53 +SimData_et_for_1ch_far_room3_A 89.25 +SimData_et_for_1ch_near_room1_A 14.37 +SimData_et_for_1ch_near_room2_A 40.46 +SimData_et_for_1ch_near_room3_A 51.50 +Avg_Real(2) 88.56 +Avg_Sim(6) 49.97 + +#################### +exp/tri2a_mc/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 53.38 +RealData_dt_for_1ch_near_room1_A 56.27 +SimData_dt_for_1ch_far_room1_A 16.96 +SimData_dt_for_1ch_far_room2_A 44.15 +SimData_dt_for_1ch_far_room3_A 49.88 +SimData_dt_for_1ch_near_room1_A 15.00 +SimData_dt_for_1ch_near_room2_A 21.81 +SimData_dt_for_1ch_near_room3_A 25.10 +Avg_Real(2) 54.83 +Avg_Sim(6) 28.82 + +exp/tri2a_mc/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 52.94 +RealData_et_for_1ch_near_room1_A 55.35 +SimData_et_for_1ch_far_room1_A 18.91 +SimData_et_for_1ch_far_room2_A 37.33 +SimData_et_for_1ch_far_room3_A 46.69 +SimData_et_for_1ch_near_room1_A 17.77 +SimData_et_for_1ch_near_room2_A 21.23 +SimData_et_for_1ch_near_room3_A 26.17 +Avg_Real(2) 54.14 +Avg_Sim(6) 28.02 + +#################### +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 46.27 +RealData_dt_for_1ch_near_room1_A 
48.85 +SimData_dt_for_1ch_far_room1_A 15.59 +SimData_dt_for_1ch_far_room2_A 35.86 +SimData_dt_for_1ch_far_room3_A 39.54 +SimData_dt_for_1ch_near_room1_A 12.78 +SimData_dt_for_1ch_near_room2_A 17.75 +SimData_dt_for_1ch_near_room3_A 20.23 +Avg_Real(2) 47.56 Avg_Sim(6) 23.62 -Avg_Real(2) 46.43 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2b -#### RESULTS FOR dt ##### - -exp/tri2b/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 91.66 -RealData_dt_for_1ch_near_room1_A 91.33 -SimData_dt_for_1ch_far_room1_A 26.94 -SimData_dt_for_1ch_far_room2_A 85.63 -SimData_dt_for_1ch_far_room3_A 91.99 -SimData_dt_for_1ch_near_room1_A 11.95 -SimData_dt_for_1ch_near_room2_A 34.51 -SimData_dt_for_1ch_near_room3_A 44.81 -Avg_Sim(6) 49.30 -Avg_Real(2) 91.50 - - -#### RESULTS FOR et ##### - -exp/tri2b/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 91.29 -RealData_et_for_1ch_near_room1_A 92.05 -SimData_et_for_1ch_far_room1_A 24.16 -SimData_et_for_1ch_far_room2_A 78.57 -SimData_et_for_1ch_far_room3_A 91.01 -SimData_et_for_1ch_near_room1_A 13.76 -SimData_et_for_1ch_near_room2_A 32.94 -SimData_et_for_1ch_near_room3_A 48.24 -Avg_Sim(6) 48.11 -Avg_Real(2) 91.67 - - -local/summarize_results.pl tri2b_mc -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 45.18 -RealData_dt_for_1ch_near_room1_A 49.91 -SimData_dt_for_1ch_far_room1_A 15.78 -SimData_dt_for_1ch_far_room2_A 34.75 -SimData_dt_for_1ch_far_room3_A 37.56 -SimData_dt_for_1ch_near_room1_A 13.45 -SimData_dt_for_1ch_near_room2_A 17.57 -SimData_dt_for_1ch_near_room3_A 19.49 -Avg_Sim(6) 23.10 -Avg_Real(2) 47.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 47.67 -RealData_et_for_1ch_near_room1_A 50.65 -SimData_et_for_1ch_far_room1_A 16.69 -SimData_et_for_1ch_far_room2_A 30.36 -SimData_et_for_1ch_far_room3_A 38.08 -SimData_et_for_1ch_near_room1_A 15.67 -SimData_et_for_1ch_near_room2_A 17.71 -SimData_et_for_1ch_near_room3_A 20.10 -Avg_Sim(6) 23.10 -Avg_Real(2) 49.16 - - -local/summarize_results.pl tri2b_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 39.37 -RealData_dt_for_1ch_near_room1_A 42.48 -SimData_dt_for_1ch_far_room1_A 14.11 -SimData_dt_for_1ch_far_room2_A 28.81 -SimData_dt_for_1ch_far_room3_A 31.53 -SimData_dt_for_1ch_near_room1_A 11.18 -SimData_dt_for_1ch_near_room2_A 15.01 -SimData_dt_for_1ch_near_room3_A 15.48 -Avg_Sim(6) 19.35 -Avg_Real(2) 40.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 42.03 -RealData_et_for_1ch_near_room1_A 43.53 -SimData_et_for_1ch_far_room1_A 13.87 -SimData_et_for_1ch_far_room2_A 26.02 -SimData_et_for_1ch_far_room3_A 32.80 -SimData_et_for_1ch_near_room1_A 12.42 -SimData_et_for_1ch_near_room2_A 14.82 -SimData_et_for_1ch_near_room3_A 17.02 -Avg_Sim(6) 19.49 -Avg_Real(2) 42.78 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.06 -RealData_dt_for_1ch_near_room1_A 46.04 -SimData_dt_for_1ch_far_room1_A 13.59 -SimData_dt_for_1ch_far_room2_A 29.55 -SimData_dt_for_1ch_far_room3_A 32.52 -SimData_dt_for_1ch_near_room1_A 11.21 -SimData_dt_for_1ch_near_room2_A 15.23 
-SimData_dt_for_1ch_near_room3_A 16.42 -Avg_Sim(6) 19.75 -Avg_Real(2) 44.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 43.45 -RealData_et_for_1ch_near_room1_A 46.89 -SimData_et_for_1ch_far_room1_A 13.37 -SimData_et_for_1ch_far_room2_A 25.96 -SimData_et_for_1ch_far_room3_A 31.73 -SimData_et_for_1ch_near_room1_A 11.89 -SimData_et_for_1ch_near_room2_A 14.64 -SimData_et_for_1ch_near_room3_A 17.26 -Avg_Sim(6) 19.14 -Avg_Real(2) 45.17 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 36.98 -RealData_dt_for_1ch_near_room1_A 39.68 -SimData_dt_for_1ch_far_room1_A 11.43 -SimData_dt_for_1ch_far_room2_A 25.24 -SimData_dt_for_1ch_far_room3_A 27.77 -SimData_dt_for_1ch_near_room1_A 9.19 -SimData_dt_for_1ch_near_room2_A 12.77 -SimData_dt_for_1ch_near_room3_A 13.30 -Avg_Sim(6) 16.62 -Avg_Real(2) 38.33 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 38.93 -RealData_et_for_1ch_near_room1_A 39.51 -SimData_et_for_1ch_far_room1_A 11.32 -SimData_et_for_1ch_far_room2_A 22.31 -SimData_et_for_1ch_far_room3_A 28.40 -SimData_et_for_1ch_near_room1_A 9.69 -SimData_et_for_1ch_near_room2_A 12.36 -SimData_et_for_1ch_near_room3_A 14.77 -Avg_Sim(6) 16.47 -Avg_Real(2) 39.22 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 31.58 -RealData_dt_for_1ch_near_room1_A 32.00 -SimData_dt_for_1ch_far_room1_A 8.51 -SimData_dt_for_1ch_far_room2_A 18.36 -SimData_dt_for_1ch_far_room3_A 20.40 -SimData_dt_for_1ch_near_room1_A 6.47 -SimData_dt_for_1ch_near_room2_A 9.61 -SimData_dt_for_1ch_near_room3_A 9.59 -Avg_Sim(6) 12.16 -Avg_Real(2) 31.79 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 30.32 -RealData_et_for_1ch_near_room1_A 32.45 -SimData_et_for_1ch_far_room1_A 7.74 -SimData_et_for_1ch_far_room2_A 17.01 -SimData_et_for_1ch_far_room3_A 21.05 -SimData_et_for_1ch_near_room1_A 7.01 -SimData_et_for_1ch_near_room2_A 9.52 -SimData_et_for_1ch_near_room3_A 11.29 -Avg_Sim(6) 12.27 -Avg_Real(2) 31.39 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 mbr_basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 30.96 -RealData_dt_for_1ch_near_room1_A 30.88 -SimData_dt_for_1ch_far_room1_A 8.33 -SimData_dt_for_1ch_far_room2_A 18.14 -SimData_dt_for_1ch_far_room3_A 20.15 -SimData_dt_for_1ch_near_room1_A 6.24 -SimData_dt_for_1ch_near_room2_A 9.47 -SimData_dt_for_1ch_near_room3_A 9.62 -Avg_Sim(6) 11.99 -Avg_Real(2) 30.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 29.37 -RealData_et_for_1ch_near_room1_A 31.84 -SimData_et_for_1ch_far_room1_A 7.64 -SimData_et_for_1ch_far_room2_A 16.86 -SimData_et_for_1ch_far_room3_A 20.59 -SimData_et_for_1ch_near_room1_A 6.93 -SimData_et_for_1ch_near_room2_A 9.48 -SimData_et_for_1ch_near_room3_A 11.19 -Avg_Sim(6) 12.11 -Avg_Real(2) 30.61 +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 48.11 +RealData_et_for_1ch_near_room1_A 48.42 +SimData_et_for_1ch_far_room1_A 16.57 
+SimData_et_for_1ch_far_room2_A 31.54 +SimData_et_for_1ch_far_room3_A 39.32 +SimData_et_for_1ch_near_room1_A 14.31 +SimData_et_for_1ch_near_room2_A 18.42 +SimData_et_for_1ch_near_room3_A 21.03 +Avg_Real(2) 48.27 +Avg_Sim(6) 23.53 + +#################### +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 34.04 +RealData_dt_for_1ch_near_room1_A 33.37 +SimData_dt_for_1ch_far_room1_A 10.57 +SimData_dt_for_1ch_far_room2_A 22.63 +SimData_dt_for_1ch_far_room3_A 25.00 +SimData_dt_for_1ch_near_room1_A 7.57 +SimData_dt_for_1ch_near_room2_A 10.97 +SimData_dt_for_1ch_near_room3_A 12.59 +Avg_Real(2) 33.70 +Avg_Sim(6) 14.89 + +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 33.49 +RealData_et_for_1ch_near_room1_A 34.72 +SimData_et_for_1ch_far_room1_A 10.03 +SimData_et_for_1ch_far_room2_A 20.16 +SimData_et_for_1ch_far_room3_A 25.08 +SimData_et_for_1ch_near_room1_A 8.45 +SimData_et_for_1ch_near_room2_A 11.16 +SimData_et_for_1ch_near_room3_A 12.88 +Avg_Real(2) 34.11 +Avg_Sim(6) 14.63 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 31.17 +RealData_dt_for_1ch_near_room1_A 31.82 +SimData_dt_for_1ch_far_room1_A 8.53 +SimData_dt_for_1ch_far_room2_A 17.43 +SimData_dt_for_1ch_far_room3_A 21.04 +SimData_dt_for_1ch_near_room1_A 6.78 +SimData_dt_for_1ch_near_room2_A 8.97 +SimData_dt_for_1ch_near_room3_A 10.01 +Avg_Real(2) 31.50 +Avg_Sim(6) 12.13 + +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 31.20 +RealData_et_for_1ch_near_room1_A 30.98 +SimData_et_for_1ch_far_room1_A 8.42 +SimData_et_for_1ch_far_room2_A 17.63 +SimData_et_for_1ch_far_room3_A 20.71 +SimData_et_for_1ch_near_room1_A 7.03 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 11.11 +Avg_Real(2) 31.09 +Avg_Sim(6) 12.40 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 30.42 +RealData_dt_for_1ch_near_room1_A 31.50 +SimData_dt_for_1ch_far_room1_A 8.24 +SimData_dt_for_1ch_far_room2_A 17.25 +SimData_dt_for_1ch_far_room3_A 20.72 +SimData_dt_for_1ch_near_room1_A 6.76 +SimData_dt_for_1ch_near_room2_A 8.87 +SimData_dt_for_1ch_near_room3_A 9.92 +Avg_Real(2) 30.96 +Avg_Sim(6) 11.96 + +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 30.89 +RealData_et_for_1ch_near_room1_A 31.01 +SimData_et_for_1ch_far_room1_A 8.20 +SimData_et_for_1ch_far_room2_A 17.34 +SimData_et_for_1ch_far_room3_A 20.56 +SimData_et_for_1ch_near_room1_A 6.91 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 10.93 +Avg_Real(2) 30.95 +Avg_Sim(6) 12.24 diff --git a/egs/reverb/s5/cmd.sh b/egs/reverb/s5/cmd.sh index e88b07e1195..71dd849a93b 100644 --- a/egs/reverb/s5/cmd.sh +++ b/egs/reverb/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64,gpu=1 -q g.q" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/reverb/s5/corpus.sh b/egs/reverb/s5/corpus.sh deleted file mode 100644 index 32a2ee4b85b..00000000000 --- a/egs/reverb/s5/corpus.sh +++ /dev/null @@ -1,17 +0,0 @@ -if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then - REVERB_home=/export/corpora5/REVERB_2014/REVERB - export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 - # set LDC WSJ0 directory to obtain LMs - # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) - export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B - # It is assumed that there will be a 'wsj0' subdirectory - # within the top-level corpus directory -else - echo "Set the data directory locations." && exit 1; -fi - -export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt -export reverb_et=$REVERB_home/REVERB_WSJCAM0_et -export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev -export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval - diff --git a/egs/reverb/s5/local/Generate_mcTrainData_cut.m b/egs/reverb/s5/local/Generate_mcTrainData_cut.m old mode 100644 new mode 100755 diff --git a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh index c3de2ba7fd3..a4599f97702 100755 --- a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh @@ -65,8 +65,8 @@ if [ ! -z "$3" ]; then dt_or_x=$3 fi -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch @@ -97,11 +97,11 @@ s/\x0D$//' \ # e.g. 
yield' --> yield # reason: YIELD' is not in dict, while YIELD is s/YIELD'/YIELD/g - s/'ROOTS'/ROOTS/g - s/'WHERE/WHERE/g + s/'ROOTS'/ROOTS/g + s/'WHERE/WHERE/g s/PEOPLE'/PEOPLE/g s/SIT'/SIT/g - s/'DOMINEE/DOMINEE/g + s/'DOMINEE/DOMINEE/g s/CHURCH'/CHURCH/g" \ -e ' # fix the single missing double full stop issue at the end of an utterance @@ -110,9 +110,9 @@ s/\x0D$//' \ /^[A-Z]$/ { # append a line N - # search for single dot on the second line + # search for single dot on the second line /\n\./ { - # found it - now replace the + # found it - now replace the s/\([A-Z]\)\n\./\1\.\n\./ } }' \ @@ -156,9 +156,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh index 2c169e84b59..6ab2f2f4b73 100755 --- a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh @@ -50,8 +50,8 @@ fi cd $dir MIC=primary -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch #taskFiles=`ls $taskFileDir/*Data_dt_for_*` @@ -108,9 +108,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/calc_wer.sh b/egs/reverb/s5/local/calc_wer.sh new file mode 100755 index 00000000000..c4b5eeb87f3 --- /dev/null +++ b/egs/reverb/s5/local/calc_wer.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 MERL (author: Shinji Watanabe) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +. ./cmd.sh +. ./path.sh + +lmw=15 +am="tri2a" +lm="bg_5k" +decode="" + +. utils/parse_options.sh + +if [ ! 
-z $decode ]; then + decode="_$decode" +fi + +dir="exp/$am/decode${decode}_${lm}_REVERB_" +echo "####################" +echo "${dir}*dt*" +for a in `echo ${dir}*dt* | tr " " "\n" | grep -v "A\.si"`; do + echo $a | awk -F '_' '{for(i=NF-6;i -1) { - if ($ARGV[0] =~ /^--lmw=(\d+)$/) - { - $opt_lmw = $1 + 0; - shift @ARGV; - } - elsif ($ARGV[0] =~ /^--lm=(\w+)$/) { - $lm = $1; - shift @ARGV; - } - else { - last; - } -} - - -print "$0 @ARGV\n"; - -my $system = "tri2b_mc"; -if ($ARGV[0] ne "") { $system = $ARGV[0]; } - -for my $dt_or_et ("dt", "et") { - -print "#### RESULTS FOR $dt_or_et ##### \n\n"; - -my $pref = "REVERB_$dt_or_et"; -#if ($lm ne "bg_5k") { -$pref = "${lm}_$pref"; -#} -if ($ARGV[1] ne "") { $pref = $ARGV[1] . '_' . $pref; } -if ($ARGV[2] ne "") { $pref = $pref . '_' . $ARGV[2]; } - -my $suff = ""; - -print "exp/$system/decode_$suff$pref*\n"; -my @folders = glob("exp/$system/decode_$suff$pref*"); - -my ($min_lmw, $max_lmw) = (9, 20); -@folders = grep { -f "$_/wer_$min_lmw" } @folders; -my @sum_wer; -my %wer; -my %avg_wer_disp; -my $nc = 0; -my $ns = 0; -my $nr = 0; -for my $lmw ($min_lmw..$max_lmw) -{ - for my $fold (@folders) { - my $res_file = "$fold/wer_$lmw"; - #print "fold = $fold pref = $pref\n"; - #my ($cond) = $fold =~ /decode_(\w+)$/; - my ($cond) = $fold =~ /decode_\Q$suff\E\Q${pref}\E_(\w+)$/; - if ($cond =~ /^Sim.+(far|near|cln)|^Real/) { - open(RES, $res_file) or die "$res_file: $_"; - while () { - if (/%WER\s+(\S+)/) { - my $wer = $1; - #print "cond = $cond lmw = $lmw wer = $1\n"; - if ($cond !~ /cln/) { - $sum_wer[$lmw] += $wer; - } - $wer{$cond}[$lmw] = $wer; - } - } - #print "cond = $cond fold = $fold\n"; - } - } -} - -if (!$opt_lmw && $dt_or_et eq "dt") { - $opt_lmw = $min_lmw; - for my $lmw ($min_lmw+1..$max_lmw) { - if ($sum_wer[$lmw] < $sum_wer[$opt_lmw]) { - $opt_lmw = $lmw; - } - } -} - -print "LMW = $opt_lmw\n"; -for my $cond (sort keys %wer) { - print "$cond\t$wer{$cond}[$opt_lmw]\n"; - if ($cond =~ /SimData_[de]t/) { - if ($cond !~ /cln/) { - $avg_wer_disp{"SimData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"SimData"}) / ++$ns; - } - else { - $avg_wer_disp{"CleanData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"CleanData"}) / ++$nc; - } - } - elsif ($cond =~ /RealData_[de]t/) { - $avg_wer_disp{"RealData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"RealData"}) / ++$nr; - } -} - -#print "Avg_Clean($nc)\t", sprintf("%.2f", $avg_wer_disp{"CleanData"}), "\n"; -print "Avg_Sim($ns)\t", sprintf("%.2f", $avg_wer_disp{"SimData"}), "\n"; -print "Avg_Real($nr)\t", sprintf("%.2f", $avg_wer_disp{"RealData"}), "\n"; -print "\n\n"; - -} diff --git a/egs/reverb/s5/run.sh b/egs/reverb/s5/run.sh index 0e3eac6e6c1..ffb0b20422d 100755 --- a/egs/reverb/s5/run.sh +++ b/egs/reverb/s5/run.sh @@ -15,89 +15,92 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. +# Caution: some of the graph creation steps use quite a bit of memory, so you +# should run this on a machine that has sufficient memory. + # Requirements) matlab and tcsh if [ ! `which tcsh` ]; then - echo "Install tcsh, which is used in some REVERB scripts" - exit 1 + echo "Install tcsh, which is used in some REVERB scripts" + exit 1 fi if [ ! 
`which matlab` ]; then - echo "Install matlab, which is used to generate multi-condition data" - exit 1 + echo "Install matlab, which is used to generate multi-condition data" + exit 1 fi -if [ ! -e path.sh ] || [ ! -e corpus.sh ]; then - echo "ERROR: path.sh and/or corpus.sh not found" - echo "You need to create these from {path,corpus}.sh.default to match your system" - echo "Make sure you follow the instructions in ../README.txt" - exit 1 +. ./cmd.sh +. ./path.sh + +stage=1 +. utils/parse_options.sh +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail + +# please make sure to set the paths of the REVERB and WSJ0 data +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + REVERB_home=/export/corpora5/REVERB_2014/REVERB + export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +elif [[ $(hostname -f) == *.merl.com ]] ; then + REVERB_home=/db/laputa1/data/original/public/REVERB + export wsjcam0=$REVERB_home/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/db/laputa1/data/original/public/WSJ0/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +else + echo "Set the data directory locations." && exit 1; fi +export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt +export reverb_et=$REVERB_home/REVERB_WSJCAM0_et +export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev +export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval -. ./cmd.sh - -# please make sure to set the paths of the REVERB and WSJ0 data -. ./corpus.sh - -# set the directory of the multi-condition training data generated +# set the directory of the multi-condition training data to be generated reverb_tr=`pwd`/data_tr_cut/REVERB_WSJCAM0_tr_cut # LDA context size (left/right) (4 is default) context_size=4 -# The language models with which to decode (tg_5k or bg_5k or "tg_5k bg_5k" for -# both) -lms="bg_5k tg_5k" +# The language models with which to decode (tg_5k or bg_5k) +lm="tg_5k" # number of jobs for feature extraction and model training nj_train=30 # number of jobs for decoding -# use less jobs for trigram model -# if you have enough RAM (~ 32 GB), you can use 8 jobs for trigram as well -nj_bg=8 -nj_tg=8 -nj_bg=25 ## -nj_tg=25 ## - -# set to true if running from scratch -do_prep=true +nj_decode=8 # set to true if you want the tri2a systems (re-implementation of the HTK baselines) do_tri2a=true - -# The following are the settings determined by Gaussian Process optimization. -# However, they are not used in the final system. -# You can use the code below for training the "tri2c_mc" system. - -# LDA parameters for MCT recognizer. -# Use significantly more context than the default (7 frames ~ 85 ms) -mct_lda_left_context=7 -mct_lda_right_context=5 - -# Number of states and Gaussians for the MCT recognizer. -mct_nstates=7500 -mct_ngauss=45000 - -## End of GP tuned settings - -false && { -if $do_prep; then +if [ $stage -le 1 ]; then # Generate multi-condition training data # Note that utterance lengths match the original set. 
# This enables using clean alignments in multi-condition training (stereo training) - #local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr + local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr +fi +if [ $stage -le 2 ]; then # Prepare wsjcam0 clean data and wsj0 language model. - local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 || exit 1 + local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 # Prepare merged BEEP/CMU dictionary. - local/wsj_prepare_beep_dict.sh || exit 1; + local/wsj_prepare_beep_dict.sh # Prepare wordlists, etc. - utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang # Prepare directory structure for clean data. Apply some language model fixes. - local/wsjcam0_format_data.sh || exit 1; + local/wsjcam0_format_data.sh # Now it's getting more interesting. # Prepare the multi-condition training data and the REVERB dt set. @@ -108,253 +111,227 @@ if $do_prep; then # local/REVERB_wsjcam0_data_prep.sh /path/to/processed/REVERB_WSJCAM0_dt processed_REVERB_dt dt # The first argument is supposed to point to a folder that has the same structure # as the REVERB corpus. - local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et || exit 1; + local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr + local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt + local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et # Prepare the REVERB "real" dt set from MCWSJAV corpus. # This corpus is *never* used for training. # This creates the data set called REVERB_Real_dt and its subfolders - local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt # The MLF file exists only once in the corpus, namely in the real_dt directory # so we pass it as 4th argument - local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf +fi +if [ $stage -le 3 ]; then # Extract MFCC features for clean sets. # For the non-clean data sets, this is outsourced to the data preparation scripts. mfccdir=mfcc ### for x in si_tr si_dt; do it seems that the number of transcriptions of si_dt is not correct. - for x in si_tr; do - steps/make_mfcc.sh --nj $nj_train \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + for x in si_tr; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj_train \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done fi -# Train monophone model on clean data (si_tr). -if [ ! -e exp/mono0a/final.mdl ]; then - echo "### TRAINING mono0a ###" - steps/train_mono.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a || exit 1; +if [ $stage -le 4 ]; then + # Train monophone model on clean data (si_tr). + echo "### TRAINING mono0a ###" + steps/train_mono.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a + + # Align monophones with clean data. + echo "### ALIGNING mono0a_ali ###" + steps/align_si.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a exp/mono0a_ali + + # Create first triphone recognizer. 
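+  # (For steps/train_deltas.sh the two numeric arguments are the target number of
+  # tree leaves and the total number of Gaussians, here 2000 and 10000.)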
+ echo "### TRAINING tri1 ###" + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 + + echo "### ALIGNING tri1_ali ###" + # Re-align triphones. + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri1 exp/tri1_ali fi -# Align monophones with clean data. -if [ ! -e exp/mono0a_ali/ali.1.gz ]; then - echo "### ALIGNING mono0a_ali ###" - steps/align_si.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a exp/mono0a_ali || exit 1; -fi - -# Create first triphone recognizer. -if [ ! -e exp/tri1/final.mdl ]; then - echo "### TRAINING tri1 ###" - steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 || exit 1; -fi - -# Prepare first triphone recognizer and decode clean si_dt for verification. -#utils/mkgraph.sh data/lang_test_bg_5k exp/tri1 exp/tri1/graph_bg_5k || exit 1; -#steps/decode.sh --nj 8 exp/tri1/graph_bg_5k data/si_dt exp/tri1/decode_si_dt - -if [ ! -e exp/tri1_ali/ali.1.gz ]; then - echo "### ALIGNING tri1_ali ###" - # Re-align triphones. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri1 exp/tri1_ali || exit 1; -fi - - # The following code trains and evaluates a delta feature recognizer, which is similar to the HTK # baseline (but using per-utterance basis fMLLR instead of batch MLLR). This is for reference only. if $do_tri2a; then +if [ $stage -le 5 ]; then # Train tri2a, which is deltas + delta-deltas, on clean data. - steps/train_deltas.sh \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a # Re-align triphones using clean data. This gives a smallish performance gain. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri2a exp/tri2a_ali || exit 1; + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri2a exp/tri2a_ali # Train a multi-condition triphone recognizer. # This uses alignments on *clean* data, which is allowed for REVERB. - # However, we have to use the "cut" version so that the length of the + # However, we have to use the "cut" version so that the length of the # waveforms match. # It is actually asserted by the Challenge that clean and multi-condition waves are aligned. - steps/train_deltas.sh \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc # Prepare clean and mc tri2a models for decoding. 
- utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k - utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k & + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k & + wait +fi +if [ $stage -le 6 ]; then # decode REVERB dt using tri2a, clean - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done # decode REVERB dt using tri2a, mc - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done + # basis fMLLR for tri2a_mc system # This computes a transform for every training utterance and computes a basis from that. - steps/get_fmllr_basis.sh --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc || exit 1; + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc # Recognition using fMLLR adaptation (per-utterance processing). - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode_basis_fmllr.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done - -fi # train tri2a, tri2a_mc - - -# Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. -if [ ! -e exp/tri2b/final.mdl ]; then - echo "### TRAINING tri2b ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b || exit 1; + wait fi - -# tri2b (LDA-MLLT system) with multi-condition training, using default parameters. -if [ ! -e exp/tri2b_mc/final.mdl ]; then - echo "### TRAINING tri2b_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc || exit 1; fi - -# tri2c (LDA-MLLT system) with multi-condition training, optimized parameters. -# Disabled by default -- it only improves slightly, and tends to overfit. -if [ ! -e exp/tri2c_mc/final.mdl ]; then - echo "### TRAINING tri2c_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$mct_lda_left_context --right-context=$mct_lda_right_context" \ - $mct_nstates $mct_ngauss data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2c_mc || exit 1; +if [ $stage -le 7 ]; then + # Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. 
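+  # (train_lda_mllt.sh splices the features across +/-$context_size frames, reduces
+  # them with LDA and then estimates an MLLT/STC transform on top of that.)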
+ echo "### TRAINING tri2b ###" + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b + + # tri2b (LDA-MLLT system) with multi-condition training, using default parameters. + echo "### TRAINING tri2b_mc ###" + steps/train_lda_mllt.sh --cmd "$train_cmd"\ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc fi - # Prepare tri2b* systems for decoding. -for recog in tri2b tri2b_mc; do - for lm in $lms; do - graph=exp/$recog/graph_$lm - if [ ! -e "$graph" ]; then - echo "### MAKING GRAPH $graph ###" - utils/mkgraph.sh data/lang_test_$lm exp/$recog $graph || exit 1; - fi - done -done - +if [ $stage -le 8 ]; then + echo "### MAKING GRAPH {tri2b,tri2b_mc}/graph_$lm ###" + for recog in tri2b tri2b_mc; do + utils/mkgraph.sh data/lang_test_$lm exp/$recog exp/$recog/graph_$lm & + done + wait +fi # discriminative training on top of multi-condition systems # one could also add tri2b here to have a DT clean recognizer for reference -for base_recog in tri2b_mc; do - - bmmi_recog=${base_recog}_mmi_b0.1 - echo "### DT $base_recog --> $bmmi_recog ###" +if [ $stage -le 9 ]; then + base_recog=tri2b_mc + bmmi_recog=${base_recog}_mmi_b0.1 + echo "### DT $base_recog --> $bmmi_recog ###" + + # get alignments from base recognizer + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali + + # get lattices from base recognizer + denlats_dir=${base_recog}_denlats + subsplit=`echo $nj_train \* 2 | bc` + # DT with multi-condition data ... + steps/make_denlats.sh --sub-split $subsplit --nj $nj_train --cmd "$decode_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir + + # boosted MMI training + steps/train_mmi.sh --boost 0.1 --cmd "$train_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A \ + data/lang \ + exp/${base_recog}_ali \ + exp/$denlats_dir \ + exp/$bmmi_recog + cp exp/$base_recog/ali.* exp/$bmmi_recog +fi - # get alignments from base recognizer - if [ ! -e exp/${base_recog}_ali/ali.1.gz ]; then - steps/align_si.sh --nj $nj_train \ - --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali || exit 1; - fi +# decoding using various recognizers +if [ $stage -le 10 ]; then + # put tri2b last since it takes longest due to the large mismatch. 
+ for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + echo "### DECODING with $recog, noadapt, $lm ###" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_$decode_suff & + done + wait + + echo " ## MBR RESCORING with $recog, noadapt ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_$decode_suff + cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff & + done + wait - # get lattices from base recognizer - denlats_dir=${base_recog}_denlats - subsplit=`echo $nj_train \* 2 | bc` - if [ ! -e exp/$denlats_dir/.done.1 ]; then - # DT with multi-condition data ... - steps/make_denlats.sh --sub-split $subsplit --nj $nj_train \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir || exit 1; - fi + done # loop recog +fi - # boosted MMI training - if [ ! -e exp/$bmmi_recog/final.mdl ]; then - steps/train_mmi.sh --boost 0.1 \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A \ - data/lang \ - exp/${base_recog}_ali \ - exp/$denlats_dir \ - exp/$bmmi_recog || exit 1; - cp exp/$base_recog/ali.* exp/$bmmi_recog +# decoding using various recognizers with adaptation +if [ $stage -le 11 ]; then + # put tri2b last since it takes longest due to the large mismatch. + for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + # set the adaptation data + if [[ "$recog" =~ _mc ]]; then + tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A + else + tr_dataset=si_tr fi -done - -} + echo "### DECODING with $recog, basis_fmllr, $lm ###" + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/$tr_dataset data/lang exp/$recog + for dataset in data/REVERB_*{dt,et}/*; do + ( + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_basis_fmllr_$decode_suff + ) & + done + wait + + echo " ## MBR RESCORING with $recog, basis_fmllr ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff + cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff & + done + wait -# decoding using bigram / trigram and various recognizers -do_adapt=true -for lm in $lms; do - if [[ "$lm" =~ tg ]]; then - nj=$nj_tg - else - nj=$nj_bg - fi - # put tri2b last since it takes longest due to the large mismatch. 
- for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do - # The graph from the ML directory is used in recipe - recog2=`echo $recog | sed s/_mmi.*//` - graph=exp/$recog2/graph_$lm - for dataset in data/REVERB_dt/SimData_dt* \ - data/REVERB_et/SimData_et* \ - data/REVERB_Real_dt/RealData_dt* \ - data/REVERB_Real_et/RealData_et*; do - if [[ $dataset =~ _dt ]]; then - pdataset=REVERB_dt - elif [[ $dataset =~ _et ]]; then - pdataset=REVERB_et - else - echo "$0: Cannot figure out what to do with: $dataset" - exit 1 - fi - #pdataset=$(basename $(dirname $dataset)) - #echo $pdataset - decode_suff=${lm}_${pdataset}_`basename $dataset` - if [ ! -e exp/$recog/decode_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, noadapt, $lm ###" - steps/decode.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_$decode_suff - cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, noadapt ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff || exit 1 - fi - if $do_adapt; then - if [ ! -e exp/$recog/fmllr.basis ]; then - if [[ "$recog" =~ _mc ]]; then - tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A - else - tr_dataset=si_tr - fi - steps/get_fmllr_basis.sh --per-utt true data/$tr_dataset data/lang exp/$recog || exit 1; - fi - if [ ! -e exp/$recog/decode_basis_fmllr_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, basis_fmllr, $lm ###" - steps/decode_basis_fmllr.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_basis_fmllr_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_basis_fmllr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff - cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, basis_fmllr ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff || exit 1 - fi - fi - - done # loop data set - done # loop recog -done # loop LM + done # loop recog +fi # get all WERs with lmw=15 -local/get_results.sh +if [ $stage -le 12 ]; then + local/get_results.sh +fi diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index b515804cfc2..1014fce03ed 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -229,6 +229,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 7.33 [ 919 / 12533, 80 ins, 153 del, 686 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch3/wer_13 %WER 7.36 [ 923 / 12533, 85 ins, 148 del, 690 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch4/wer_13 +### chain results ### +# current best chain result with TDNN (check local/chain/run_tdnn_5f.sh) +%WER 2.94 [ 369 / 12533, 51 ins, 71 del, 247 sub ] exp/chain/tdnn_5f/decode/wer_3_0.5 ### nnet1 results ### diff --git a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4478796305e..6e2f3e9ee48 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -1,30 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" +export train_cmd=queue.pl +export decode_cmd=queue.pl +export mkgraph_cmd=queue.pl +export cuda_cmd="queue.pl --gpu 1" -# cuda_cmd is used for nnet1 scripts e.g. local/run_dnn.sh, but -# in the nnet2 scripts e.g. local/run_nnet2.sh, this is not -# used and we append options to train_cmd. -cuda_cmd="queue.pl -l arch=*64 -l gpu=1" - -#train_cmd="run.pl" -# with run.pl we do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. +# The rest of this file is here for historical reasons. For cluster-specific +# configuration it's generally better to use conf/queue.conf, see +# http://kaldi-asr.org/doc/queue.html. # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5f.sh new file mode 100644 index 00000000000..0379d16fe13 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_5f.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# this script is a modified version of swbd/run_tdnn_5f.sh + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_5f + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. 
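+  # ('chain' models run at a third of the normal frame rate, hence
+  # --frame-subsampling-factor 3; 1200 is the number of tree leaves, kept small
+  # because RM is a small corpus.)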
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 200 --jesus-forward-output-dim 500 --jesus-hidden-dim 2000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1000000 \ + --lm-opts "--num-extra-lm-states=200" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet2_online/ivectors \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/train $treedir exp/tri3b_lats $dir || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 9 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test $dir/decode || exit 1; +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/sprakbanken/s5/cmd.sh b/egs/sprakbanken/s5/cmd.sh index 43867ccf0d9..71dd849a93b 100644 --- a/egs/sprakbanken/s5/cmd.sh +++ b/egs/sprakbanken/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64 --mem 2G" -#export mkgraph_cmd="queue.pl -l arch=*64 --mem 2G" -#export big_memory_cmd="queue.pl -l arch=*64 --mem 2G" -#export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/sre08/v1/cmd.sh b/egs/sre08/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/sre08/v1/cmd.sh +++ b/egs/sre08/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre10/v1/cmd.sh b/egs/sre10/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100755 --- a/egs/sre10/v1/cmd.sh +++ b/egs/sre10/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/swbd/s5/cmd.sh b/egs/swbd/s5/cmd.sh index 4abf8546b0d..bae7f5cdf45 100644 --- a/egs/swbd/s5/cmd.sh +++ b/egs/swbd/s5/cmd.sh @@ -1,28 +1,16 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." 
-export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5b/cmd.sh b/egs/swbd/s5b/cmd.sh index 4abf8546b0d..575407ac0ff 100644 --- a/egs/swbd/s5b/cmd.sh +++ b/egs/swbd/s5b/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index bba9b4cbfdd..7c2e22888d9 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -105,6 +105,9 @@ exit 0 %WER 14.5 | 1831 21395 | 86.8 8.5 4.6 1.3 14.5 52.4 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg/score_12/eval2000_hires.ctm.swbd.filt.sys %WER 14.8 | 1831 21395 | 86.7 9.0 4.3 1.6 14.8 52.8 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg_per_utt/score_10/eval2000_hires.ctm.swbd.filt.sys + +( +# old results with 25 million parameter model. We do not want to use such a big model. So see the new results below # local/nnet3/run_lstm.sh # these are results with nnet3 LSTMs cell_dim=1280, recurrent_dim=384, lstm_delay=-1 -2 -3, label_delay=5 num_params=25010228 (8 epoch training on speed-perturbed # and volume perturbed data) @@ -114,6 +117,21 @@ exit 0 %WER 18.1 | 4459 42989 | 84.0 11.2 4.8 2.0 18.1 54.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys %WER 22.0 | 2628 21594 | 80.5 13.9 5.6 2.5 22.0 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 23.3 | 2628 21594 | 79.4 14.7 6.0 2.7 23.3 59.2 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +) + + +# local/nnet3/run_lstm.sh +# these are results with nnet3 LSTMs cell_dim=1024, recurrent_dim=256, nonrecurrent_projection_dim=256, lstm_delay=-1 -2 -3, label_delay=5 num_params=14.6M (8 epoch training on speed-perturbed +# this setup has the newly introduced feature self-repair, in addition to shrink +%WER 11.6 | 1831 21395 | 89.7 6.9 3.4 1.3 11.6 46.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.6 | 1831 21395 | 88.7 7.6 3.7 1.4 12.6 49.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 21.3 | 2628 21594 | 81.0 13.2 5.8 2.4 21.3 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 23.1 | 2628 21594 | 79.5 14.7 5.8 2.6 23.1 59.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 16.5 | 4459 42989 | 85.3 10.1 4.6 1.8 16.5 53.0 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.9 | 4459 42989 | 84.1 11.2 4.8 2.0 17.9 55.5 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 15.17 [ 7466 / 49204, 993 ins, 1937 del, 4536 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +%WER 16.12 [ 7931 / 49204, 1072 ins, 1910 del, 4949 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_tg/wer_11_0.0 + # bidirectional LSTM # ----------------------- @@ -142,7 +160,11 @@ exit 0 %WER 11.3 | 1831 21395 | 90.0 6.8 3.2 1.3 11.3 46.6 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 13.0 | 1831 21395 | 88.6 7.9 3.6 1.6 13.0 50.4 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys - +# current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) +%WER 10.5 | 1831 21395 | 90.8 6.4 2.9 1.3 10.5 44.3 | 
exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 15.9 | 4459 42989 | 86.0 9.6 4.3 2.0 15.9 51.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 # these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh %WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 3f7de21e279..a14090a74a1 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -1,24 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -# Default opts, -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export cuda_cmd=run.pl # Run on local machine, -export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" -# BUT options, + +# the rest of this file is present for historical reasons. it's better to +# create and edit conf/queue.conf for cluster-specific configuration. if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q long.q -l gpu=1" fi diff --git a/egs/swbd/s5c/conf/mfcc_dbl3.conf b/egs/swbd/s5c/conf/mfcc_dbl3.conf new file mode 100644 index 00000000000..f0e09186f3e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_dbl3.conf @@ -0,0 +1,16 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. 
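+# As a rough illustration (the directory names below are only placeholders), features
+# with this config would be extracted with something like
+#   steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_dbl.conf --nj 30 --cmd "$train_cmd" \
+#     data/train_hires_dbl exp/make_mfcc/train_hires_dbl mfcc_hires_dbl
+# followed by steps/compute_cmvn_stats.sh on the same directories.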
+--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=10 # for the higher-frequency-resolution mfcc coefficients, we'll use + # a larger window size of 25ms and the normal window. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-length=17 # shorter than normal (25ms) frame length.... the shortest we can + # go without the FFT becoming lower resolution which might cause + # problems +--window-type=hanning # additionally making the context shorter by using a more aggressively tapering window. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf new file mode 100644 index 00000000000..c41b76116ee --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf @@ -0,0 +1,12 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-length=20 # slightly less than the normal 25ms frame length. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf new file mode 100644 index 00000000000..92670e7ed6e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hiresf.conf b/egs/swbd/s5c/conf/mfcc_hiresf.conf new file mode 100644 index 00000000000..c0b1798a9c5 --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hiresf.conf @@ -0,0 +1,12 @@ +# this is a config for 'fast' (7.5ms frame shift) high-resolution MFCC features, +# intended for use with chain models. Note: we keep all cepstra, so it has the +# same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. 
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-length=25 # the normal frame length +--frame-shift=7.5 diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt new file mode 100644 index 00000000000..8e347f4f889 --- /dev/null +++ b/egs/swbd/s5c/local/chain/README.txt @@ -0,0 +1,29 @@ + +there are a lot of tuning experiments here. + +ones to look at right now: + 2y is a TDNN baseline + 4f is a good jesus-layer system + 4q is an improved TDNN with various bells and whistles from Vijay. + 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far that doesn't have statistics-averaging layers. + 5g uses a statistics-averaging layer in the middle to slightly improve on 5e (by about + 0.2%). + 5j is a basic configuration without iVectors (about 2% abs worse than 5e) + 5k is the best configurations without iVectors... about 1% abs worse than 5e; we + use statistics-averaging layers to do some crude adaptation. + 5t gives about the same performance as 5e but is about 30% faster to train + and is smaller. + 5v is what I am currently using as a baseline- it has an even smaller + --jesus-hidden-dim as 5t (hence faster to train), but gives the same + performance. + 6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component) + and slightly more parameters, which is quicker to train than 5v but gives + about the same results. I'm hoping to use this setup, going forward. + 6i is like 6i but with a separate last-but-one affine layer for the xent output + (marginally better than 6g). + 6z is probably the thing I currently recommend to run-- it's a TDNN+ReLU based + setup that's quite fast to train and gives better results than our old + jesus-layer-based system. 
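+
+ To compare WERs across these experiments, you can use the compare_wer.sh script in
+ this directory, giving it the experiment suffixes listed above, e.g.:
+   local/chain/compare_wer.sh 5e 5t 6g 6z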
+ + diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ded03563711 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer.sh @@ -0,0 +1,62 @@ +#!/bin/bash + + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/run_blstm_6h.sh new file mode 100755 index 00000000000..b19a0b489a0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_6h.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +#%WER 9.6 | 1831 21395 | 91.6 5.8 2.6 1.2 9.6 44.2 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 14.5 | 4459 42989 | 87.4 8.9 3.7 1.9 14.5 50.5 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.3 | 2628 21594 | 83.3 11.8 4.9 2.5 19.3 54.8 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 13.32 [ 6554 / 49204, 830 ins, 1696 del, 4028 sub ] exp/chain/blstm_6h_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 + +label_delay=0 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
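+  # (As in the TDNN recipes, --frame-subsampling-factor 3 matches the reduced output
+  # frame rate of the 'chain' model; 9000 is the number of tree leaves used for the
+  # Switchboard training set.)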
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" [-3,3] [-3,3] [-3,3] " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/run_lstm_6h.sh new file mode 100755 index 00000000000..feb72aee726 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_6h.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +# %WER 15.6 | 4459 42989 | 86.1 9.2 4.7 1.8 15.6 52.1 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# %WER 10.3 | 1831 21395 | 90.9 6.1 3.0 1.3 10.3 44.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 20.7 | 2628 21594 | 82.0 12.8 5.3 2.7 20.7 56.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys + +# if right-tolerance was 10 (these are old results) +#--------------------------- +# %WER 15.8 | 4459 42989 | 86.0 9.3 4.8 1.8 15.8 52.0 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 10.6 | 1831 21395 | 90.6 6.2 3.2 1.2 10.6 45.2 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 21.0 | 2628 21594 | 81.4 12.4 6.3 2.4 21.0 56.8 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +label_delay=5 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" -3 -3 -3 " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_d.sh b/egs/swbd/s5c/local/chain/run_lstm_d.sh new file mode 100755 index 00000000000..05db63c2bee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_d.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# based on run_tdnn_2o.sh + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -3 -3 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --splice-indexes "$splice_indexes " \ + --num-lstm-layers $num_lstm_layers \ + --cell-dim $cell_dim \ + --hidden-dim $hidden_dim \ + --recurrent-projection-dim $recurrent_projection_dim \ + --non-recurrent-projection-dim $non_recurrent_projection_dim \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
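+
+  # Quick summary of the long call below: the --chain.* options set the
+  # sequence-training objective (xent regularization, leaky-HMM coefficient,
+  # L2), the --trainer.* options set the SGD schedule (3 parallel jobs growing
+  # to 16 over 4 epochs, effective learning rate decaying from 0.001 to
+  # 0.0001), and the --egs.* options control how the egs are cut into chunks
+  # of $chunk_width frames with $chunk_left_context frames of extra left
+  # context for the LSTM.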
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/lstm/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh index 2e08d5e22af..a8552244ed2 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh @@ -276,4 +276,4 @@ b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_s b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh ; done %WER 16.57 [ 8155 / 49204, 1144 ins, 1988 del, 5023 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 %WER 16.83 [ 8282 / 49204, 1106 ins, 2115 del, 5061 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 -%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 \ No newline at end of file +%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh index eaa5a77949f..218890cc418 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh @@ -1,69 +1,10 @@ #!/bin/bash - -# _2i is as _2d but with a new set of code for estimating the LM, in which we compute -# the log-like change when deciding which states to back off. The code is not the same -# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By -# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration -# is quite similar to 2d, except new/more-exact code is used. - -# see table in run_tdnn_2a.sh for results - -# _2d is as _2c but with different LM options: -# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" -# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. -# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions -# provided from the tree-building, and effectively puts the leftmost context position as a single -# set. -# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg -# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. - -# _2c is as _2a but after a code change in which we start using transition-scale -# and self-loop-scale of 1 instead of zero in training; we change the options to -# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect -# results at all; it's is mainly for convenience in pushing weights in graphs, -# and checking that graphs are stochastic. - -# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
- -# _z is as _x but setting --lm-opts "--num-extra-states=2000". -# (see also y, which has --num-extra-states=500). - -# _x is as _s but setting --lm-opts "--num-extra-states=0". -# this is a kind of repeat of the u->v experiment, where it seemed to make things -# worse, but there were other factors involved in that so I want to be sure. - -# _s is as _q but setting pdf-boundary-penalty to 0.0 -# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, -# and 18.07 -> 16.96 on train_dev, after fg rescoring. - -# _q is as _p except making the same change as from n->o, which -# reduces the parameters to try to reduce over-training. We reduce -# relu-dim from 1024 to 850, and target num-states from 12k to 9k, -# and modify the splicing setup. -# note: I don't rerun the tree-building, I just use the '5o' treedir. - -# _p is as _m except with a code change in which we switch to a different, more -# exact mechanism to deal with the edges of the egs, and correspondingly -# different script options... we now dump weights with the egs, and apply the -# weights to the derivative w.r.t. the output instead of using the -# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap -# to 30 also. This wil. give 10 frames on each side with zero derivs, then -# ramping up to a weight of 1.0 over 10 frames. - -# _m is as _k but after a code change that makes the denominator FST more -# compact. I am rerunning in order to verify that the WER is not changed (since -# it's possible in principle that due to edge effects related to weight-pushing, -# the results could be a bit different). -# The results are inconsistently different but broadly the same. On all of eval2000, -# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. -# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. - - -# _k is as _i but reverting the g->h change, removing the --scale-max-param-change -# option and setting max-param-change to 1.. Using the same egs. - +# _2i is as _i but it uses speaker perturbation combined with speed perturbation. # _i is as _h but longer egs: 150 frames instead of 75, and # 128 elements per minibatch instead of 256. +# be cautious comparing the valid probs with h though, because +# we fixed the utt2uniq bug at this point, so from h on, the valid probs +# are properly held out. # _h is as _g but different application of max-param-change (use --scale-max-param-change true) @@ -93,21 +34,23 @@ set -e # configs for 'chain' -stage=12 +stage=1 train_stage=-10 get_egs_stage=-10 speed_perturb=true +speaker_perturb=true dir=exp/chain/tdnn_2i # Note: _sp will get added to this if $speed_perturb == true. # TDNN options -splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" # training options num_epochs=4 initial_effective_lrate=0.001 final_effective_lrate=0.0001 leftmost_questions_truncate=30 -max_param_change=1.0 +max_param_change=0.3333 +scale_max_param_change=true final_layer_normalize_target=0.5 num_jobs_initial=3 num_jobs_final=16 @@ -138,16 +81,19 @@ suffix= if [ "$speed_perturb" == "true" ]; then suffix=_sp fi +if [ "$speaker_perturb" == "true" ]; then + suffix=$suffix"_fp" +fi dir=${dir}$suffix train_set=train_nodup$suffix ali_dir=exp/tri4_ali_nodup$suffix -treedir=exp/chain/tri5o_tree$suffix +treedir=exp/chain/tri5f_tree$suffix # if we are using the speed-perturbed data we need to generate # alignments for it. 
-local/nnet3/run_ivector_common.sh --stage $stage \ - --speed-perturb $speed_perturb \ +local/nnet3/run_ivector_common_2.sh --stage $stage \ + --speed-perturb $speed_perturb --speaker-perturb $speaker_perturb \ --generate-alignments $speed_perturb || exit 1; @@ -161,6 +107,7 @@ if [ $stage -le 9 ]; then fi +if false; then #100 if [ $stage -le 10 ]; then # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. the first one is only repeated @@ -179,23 +126,23 @@ if [ $stage -le 11 ]; then # Build a tree using our new topology. steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate $leftmost_questions_truncate \ - --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir fi +fi #100 if [ $stage -le 12 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi touch $dir/egs/.nodelete # keep egs around when that run dies. steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ - --pdf-boundary-penalty 0.0 \ - --lm-opts "--num-extra-lm-states=2000" \ --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ --minibatch-size $minibatch_size \ - --egs-opts "--frames-overlap-per-eg 30" \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ --frames-per-eg $frames_per_eg \ --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ --splice-indexes "$splice_indexes" \ @@ -205,7 +152,7 @@ if [ $stage -le 12 ]; then --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ --max-param-change $max_param_change \ --final-layer-normalize-target $final_layer_normalize_target \ - --relu-dim 850 \ + --relu-dim 1024 \ --cmd "$decode_cmd" \ --remove-egs $remove_egs \ data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; @@ -215,7 +162,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh index 4c0ac7e62ca..d17ebdf9be7 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh @@ -301,4 +301,4 @@ LOG (lattice-best-path:main():lattice-best-path.cc:99) For utterance sp1.0-sw028 LOG (lattice-best-path:main():lattice-best-path.cc:124) Overall score per frame is 46.9461 = 0.0637047 [graph] + 46.8824 [acoustic] over 843 frames. LOG (lattice-best-path:main():lattice-best-path.cc:128) Done 1 lattices, failed for 0 LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances. 
-sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil \ No newline at end of file +sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh new file mode 100755 index 00000000000..4f350891e8a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# _3c is as _2y, but using 'jesus' nonlinearity: the --jesus-dim 800 option, instead of +# --relu-dim 850. +# reusing the egs from 2y. +# caution: see config section, I changed some things while running. + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. 
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. 
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +# max_param_change=1.0 +max_param_change=0.5 # Changed it to this value on iteration 74. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 # switched to 64 on iteration 7 after a failure. +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --jesus-dim 800 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
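+  # (The --self-loop-scale 1.0 below matches the convention introduced in _2c
+  # above, where training switched to transition/self-loop scales of 1 instead
+  # of 0; see that note for the rationale.)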
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh new file mode 100755 index 00000000000..ca8080db080 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# (note: cannot be reproduced using current scripts). +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# Results are about the same as 2y, or maybe just a little worse. + +# a03:s5c: ./show_wer.sh 3d +# %WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
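+
+  # Reminder: --egs-dir below points at the egs already dumped for 2y, so the
+  # egs-related options here presumably only matter if the egs ever have to be
+  # re-dumped; --jesus-opts is passed through to the old make_jesus_configs.py
+  # setup mentioned in the header (which is why this run cannot be reproduced
+  # with current scripts).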
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh new file mode 100755 index 00000000000..af5661b8c85 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh @@ -0,0 +1,275 @@ +#!/bin/bash + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. +# (note: cannot be reproduced using current scripts). + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
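+
+  # This is the same training call as in run_tdnn_3d.sh except for the extra
+  # --num-jesus-blocks 200 inside --jesus-opts; as the header says, the point
+  # is to reduce computation in the Jesus layer.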
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh new file mode 100755 index 00000000000..f33459f5f08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh @@ -0,0 +1,283 @@ +#!/bin/bash + + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# (note: cannot be reproduced using current scripts). +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies.
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh new file mode 100755 index 00000000000..ff1e539306f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh @@ -0,0 +1,303 @@ +#!/bin/bash + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# (note: cannot be reproduced using current scripts). +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
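The _3e and _3g notes above adjust --num-jesus-blocks (200 vs. 100) to trade computation in the Jesus layer against accuracy. Below is a back-of-envelope sketch of why the block count matters, under the assumption that the large Jesus-layer matrices are block-diagonal with that many blocks; the 1800/15000 dimensions are taken from the --jesus-opts above, but the internal matrix shapes are an assumption, not something this patch states.

def block_diag_params(in_dim, out_dim, num_blocks):
    # Each block maps (in_dim / num_blocks) -> (out_dim / num_blocks), so the
    # total weight count scales as 1 / num_blocks relative to a full matrix.
    return num_blocks * (in_dim // num_blocks) * (out_dim // num_blocks)

for num_blocks in (100, 200):
    p = block_diag_params(1800, 15000, num_blocks)
    print("%3d blocks: %.3fM weights" % (num_blocks, p / 1e6))
# 100 blocks: 0.270M, 200 blocks: 0.135M -- doubling the block count halves
# the weights (and multiplies) in such a matrix.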
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0. +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
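The _p note above describes the egs edge handling these scripts rely on: with a frames-overlap of 30, roughly 10 frames at each edge of a chunk get zero derivative weight and the next 10 ramp linearly up to 1.0. A small sketch of what such a weight vector would look like for a 150-frame eg follows; this is illustrative arithmetic only, and the actual egs-dumping code may compute the ramp differently.

def edge_deriv_weights(num_frames=150, zero_frames=10, ramp_frames=10):
    # zero_frames frames of weight 0 at each edge, then a linear ramp of
    # ramp_frames frames up to 1.0, and 1.0 in the interior.
    w = []
    for t in range(num_frames):
        d = min(t, num_frames - 1 - t)  # distance to the nearest chunk edge
        if d < zero_frames:
            w.append(0.0)
        elif d < zero_frames + ramp_frames:
            w.append((d - zero_frames + 1) / float(ramp_frames))
        else:
            w.append(1.0)
    return w

print(edge_deriv_weights()[:25])  # 10 zeros, then 0.1, 0.2, ..., 1.0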
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh new file mode 100755 index 00000000000..f0e9efc2ac4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
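Several of the notes above mention that re-using the 2y egs is "not 100% ideal as we'd like some context" once recurrence is added, and the decode stages below pass --extra-left-context for the same reason. A rough way to see the context implied by the forward splicing alone (the recurrent ":-3" connections need additional left context beyond this, which is the point) is the following illustrative computation:

# Left/right context from the forward splice offsets of the five layers above;
# the recurrent connections are ignored here, which is exactly why extra left
# context is wanted at training and decode time.
splice_indexes = [[-2, -1, 0, 1, 2], [-1, 2], [-3, 0, 3], [-6, -3, 0, 3], [-6, -3, 0, 3]]
left_context = sum(-min(layer) for layer in splice_indexes)
right_context = sum(max(layer) for layer in splice_indexes)
print("left context:", left_context, " right context:", right_context)  # 18 and 13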
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh new file mode 100755 index 00000000000..876048b5852 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. 
+# also a code fix (the recurrent connections weren't being used; bug in OptionalDescriptor) + +# Here is the original decoding, with frame-per-chunk=50 +#./show_wer.sh 3i +#%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# and a newer decoding with frames-per-chunk=100. +# ./show_wer.sh 3i +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# after initial decoding wasn't great, trying increasing frames-per-chunk from +# 50 to 100. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 100 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh new file mode 100755 index 00000000000..faef84e8879 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh @@ -0,0 +1,296 @@ +#!/bin/bash + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3j # Note: _sp will get added to this if $speed_perturb == true. 
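The 3j header above swaps RepeatedAffineComponent for BlockAffineComponent in the Jesus layers (--use-repeated-affine false). My reading is that the repeated variant ties one small affine transform across all blocks while the block variant gives each block its own parameters; the comparison below is a back-of-envelope sketch under that assumption, using the 1500/15000 dimensions from the --jesus-opts as example sizes (the actual internal shapes are not taken from this patch).

def repeated_affine_params(in_dim, out_dim, num_blocks):
    # Assumption: one shared (in/B x out/B) weight block plus a shared bias.
    return (in_dim // num_blocks) * (out_dim // num_blocks) + out_dim // num_blocks

def block_affine_params(in_dim, out_dim, num_blocks):
    # Assumption: a separate weight block per block, plus a full bias.
    return num_blocks * (in_dim // num_blocks) * (out_dim // num_blocks) + out_dim

B = 100  # num-jesus-blocks used from 3g onward
print("repeated-affine params:", repeated_affine_params(1500, 15000, B))  # 2400
print("block-affine params:   ", block_affine_params(1500, 15000, B))     # 240000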
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh new file mode 100755 index 00000000000..b869c7b2553 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3k # Note: _sp will get added to this if $speed_perturb == true.
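+
+# (The "chunk size" discussed in the header comments is the decode-time
+#  --frames-per-chunk option of steps/nnet3/decode.sh; the decode stage below
+#  leaves it at the default of 50 mentioned above, while run_tdnn_3k2.sh later
+#  in this patch sets it explicitly.)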
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh new file mode 100755 index 00000000000..7a016ed2197 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# 3k2 is as 3k, but dumping the egs with --extra-left-context 20. 
+# Also there will have been some script changes in the meantime, +# e.g. possibly nonzero bias-mean; and reduced max-change on mix-up +# iters. + +# log-probs are better than 3k and in fact better than any experiment so far: +# valid -0.115->-0.107, and train -0.077 to -0.074. + +# Here is the WER using the default --frames-per-chunk of 50, and --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 20.45 [ 10060 / 49204, 988 ins, 3050 del, 6022 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 19.02 [ 9359 / 49204, 977 ins, 2877 del, 5505 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 22.3 | 4459 42989 | 79.9 12.8 7.3 2.3 22.3 60.2 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 20.4 | 4459 42989 | 81.5 11.1 7.4 1.9 20.4 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.filt.sys + +#... and here is the WER after changing it to 150, still with --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 18.91 [ 9306 / 49204, 1076 ins, 2517 del, 5713 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 17.43 [ 8574 / 49204, 958 ins, 2607 del, 5009 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.0 2.4 20.6 58.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 18.8 | 4459 42989 | 83.4 10.9 5.6 2.3 18.8 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is --frames-per-chunk 150, --extra-left-context 50 (changing the extra-left-context from 20 to 50 makes it worse): +#./show_wer.sh 3k2 +#%WER 19.46 [ 9574 / 49204, 1134 ins, 2635 del, 5805 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.87 [ 8792 / 49204, 880 ins, 3011 del, 4901 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 21.0 | 4459 42989 | 81.2 12.4 6.3 2.2 21.0 58.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.2 | 4459 42989 | 82.7 10.8 6.5 1.9 19.2 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --frames-per-chunk 150, --extra-left-context 50, --extra-left-context-initial 20. +#./show_wer.sh 3k2 +#%WER 19.10 [ 9400 / 49204, 1116 ins, 2498 del, 5786 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.54 [ 8628 / 49204, 884 ins, 2890 del, 4854 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.1 2.3 20.6 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 18.7 | 4459 42989 | 83.4 10.8 5.8 2.1 18.7 55.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --extra-left-context-initial 20 --extra-left-context 50 --frames-per-chunk 100. +# I think what's happening is that it's figuring out when it's near the end of the chunk, and encouraging +# deletions at that point, for reasons that relate to edge effects in the objective function. 
+#./show_wer.sh 3k2 +#%WER 17.87 [ 8793 / 49204, 1061 ins, 2277 del, 5455 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.36 [ 8049 / 49204, 1033 ins, 2148 del, 4868 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.8 11.8 5.5 2.5 19.7 57.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.4 10.3 5.2 2.2 17.8 54.7 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. 
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k2 # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --extra-left-context 20 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial 20 \ + --extra-left-context 50 \ + --frames-per-chunk 100 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh new file mode 100755 index 00000000000..608e437659e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +# [abandoned, not working well.] +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. 
I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. 
So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. 
+ +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
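+
+  # The call below re-uses the egs dumped by the 2y run (via --egs-dir) instead
+  # of generating new ones, and, as noted in the header comments, replaces the
+  # RepeatedAffineComponent in the Jesus layers with BlockAffineComponent via
+  # the --use-repeated-affine false option inside --jesus-opts.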
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh new file mode 100755 index 00000000000..b25f9f15130 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# [note: this uses BlockAffineComponent not RepeatedAffineComponent] +# _3m is as _3l, but changing --jesus-stddev-scale from 0.2 to 0.1, as the Jesus layers +# were learning too slowly in 3l (this will make them learn approximately 4x faster). +# [terminated, likelihoods were not promising]. + +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. 
(see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.1 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
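+  # (The topology that matters at decode time comes from the model in $dir,
+  # which was built on top of the tree in $treedir; data/lang_sw1_tg is used
+  # here essentially only for the lexicon and grammar side of the graph.)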
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh new file mode 100755 index 00000000000..dedbd84be75 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. 
+ +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh new file mode 100755 index 00000000000..14383fe1a32 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. +# [ seemed helpful based on likelihoods on first iterations]: on iter 42, +# train prob is -0.1554->-0.1523, and valid prob is -0.1559->-0.1540. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. 
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh new file mode 100755 index 00000000000..ddba7e7f9c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# Comparing the WER with 2y, it's about 1% abs worse [see below]. However, this is +# for an odd reason: the model, while smaller than the 2y one (8.8 vs. 12.1 million +# parameters), seems to have a lot more learning capacity, with better train and worse valid +# prob. In 3r and 3s I am trying smaller versions of this architecture. 
+ +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# 2y 3p +# final-train-prob: -0.083068 -0.0771 +# final-valid-prob: -0.01212 -0.12715 +# num-parameters: 12094115 8804087 + + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, 
more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3p # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh new file mode 100755 index 00000000000..9f67164b806 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh @@ -0,0 +1,315 @@ +#!/bin/bash + +# _3q is as _3p, but now trying out the 'block' training script, where in addition to +# the affine connections we have block-matrix connections between the layers. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
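+
+# Rough arithmetic for the 'block' connections mentioned in the _3q note at the
+# top of this file, assuming --num-affine-blocks 25 means the 900-dim block
+# input/output (see --jesus-block-opts below) is split into 25 contiguous
+# blocks of 900/25 = 36 dims, each with its own small affine: that is
+# 25 * 36 * 36 = 32,400 weights versus 810,000 for a full 900x900 affine,
+# which is presumably what makes it cheap to add these on top of the ordinary
+# affine connections.
+# echo $(( 900*900 )) $(( 25 * (900/25) * (900/25) ))   # -> 810000 32400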
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
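+
+# To picture the egs-edge weighting described in the _p note above (zero
+# derivative weight for the 10 frames nearest each edge, then a linear ramp up
+# to 1.0 over the next 10 frames), here is a commented sketch for a 150-frame
+# eg; the exact shape is whatever the egs-dumping code actually produces, this
+# just illustrates the idea:
+# awk -v T=150 'BEGIN { for (t=0; t<T; t++) {
+#     d = (t < T-1-t) ? t : T-1-t;                 # distance to the nearer edge
+#     w = (d < 10) ? 0 : (d < 20 ? (d-9)/10 : 1);  # 0, then ramp, then 1.0
+#     printf("%d %.1f\n", t, w) } }'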
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-block-opts "--jesus-full-output-dim 900 --jesus-full-input-dim 900 --jesus-block-input-dim 900 --jesus-block-output-dim 900 --jesus-hidden-dim 15000 --jesus-final-output-dim 600 --jesus-stddev-scale 0.4 --num-affine-blocks 25 --final-layer-target-rms 0.5" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,0,3 -6,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh new file mode 100755 index 00000000000..7815adffb9f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] +# [I think I abandoned this after deciding to reduce the parameters even further, +# to the setup in 3s]. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. 
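+
+# (On the 1/2 -> 1/sqrt(2) change in the _3r note above: presumably the idea is
+# that with the gradient accumulated over a minibatch, the noisy part of each
+# update scales roughly like sqrt(minibatch-size), so halving the minibatch
+# shrinks the typical step by about 1/sqrt(2); scaling the max-change by the
+# same factor keeps the limit proportionate, where a factor of 1/2 would
+# over-tighten it.)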
+ +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh new file mode 100755 index 00000000000..6cee8b11925 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
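+# (Aside, not from the original notes, just my reading of the option: the
+# leftmost_questions_truncate variable is set to -1 in the config section
+# further down and passed straight to steps/nnet3/chain/build_tree.sh in stage
+# 11, so "disabling that mechanism" above means, as far as I understand it,
+# that the tree-building questions on the leftmost context position are not
+# truncated at all.)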
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
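+# (Sketch for reference, not taken from the actual egs-dumping code: the edge
+# weighting described in the '_p' note above can be pictured as 10 frames of
+# zero derivative weight at each end of an eg, followed by a linear ramp up to
+# 1.0 over the next 10 frames.  The frame indexing below is illustrative only:
+#   for t in $(seq 0 24); do
+#     if [ $t -lt 10 ]; then w=0.00
+#     elif [ $t -lt 20 ]; then w=$(awk -v t=$t 'BEGIN{printf "%.2f", (t-9)/10}')
+#     else w=1.00
+#     fi
+#     echo "frame $t deriv-weight $w"
+#   done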
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
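+  # (Sketch, not part of the recipe: the offsets in --splice-indexes compose
+  # across layers, so the network's total context is the sum of each layer's
+  # most negative / most positive offset.  For the 3s setup below,
+  # "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3", that is 2+1+3+6+6 = 18
+  # frames of left context and 2+2+3+3+3 = 13 of right context.  A throwaway
+  # check, assuming awk is available:
+  #   echo "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" | awk '{
+  #     left=0; right=0;
+  #     for (i=1; i<=NF; i++) { n=split($i,a,","); lo=a[1]+0; hi=a[1]+0;
+  #       for (j=2; j<=n; j++) { v=a[j]+0; if (v<lo) lo=v; if (v>hi) hi=v }
+  #       left-=lo; right+=hi }
+  #     print "left-context", left, "right-context", right }'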
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh new file mode 100755 index 00000000000..25e30900e36 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +# _3t is as _3s but using slightly wider context. Dumping our own egs. +# The final train prob is better -0.0851->-0.0815, but valid prob is worse -0.1231->-0.1243. +# WER is slightly worse. So we won't use this for now, but later if we use more data we +# could try wider context like this. 
+#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# +#%WER 18.01 [ 8860 / 49204, 1043 ins, 2315 del, 5502 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.68 [ 8205 / 49204, 930 ins, 2420 del, 4855 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.6 11.9 5.5 2.3 19.7 57.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.4 5.4 2.0 17.8 55.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
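+  # (Rough sketch, not part of the recipe, of how I understand the lrate
+  # options used below: the rate actually applied on a given iteration is the
+  # 'effective' rate times the number of jobs running on that iteration, and
+  # the effective rate decays exponentially from $initial_effective_lrate to
+  # $final_effective_lrate over training.  The iteration count of 300 here is
+  # made up, purely for illustration:
+  #   awk 'BEGIN{ n=300; a=0.001; b=0.0001;
+  #        for (i=0; i<=n; i+=100)
+  #          printf "iter %d effective-lrate %.6f\n", i, a*exp(i/n*log(b/a)) }'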
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh new file mode 100755 index 00000000000..d1b93d9084c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +# _3u is as _3s (and re-using the egs) but with one more layer; keeping the same dim +# and total context, and reducing --jesus-forward-output-dim from 1500 to 1300 to +# ensure that the number of parameters doesn't increase too much. +# [stopping this run, as the likelihoods weren't promising, e.g. by iteration +# 39, the valid-prob was worse vs. 3t, -0.1488 -> -0.1521 (train: -0.1510 -> -0.1532) + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] 
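+# (Aside, not from the original notes: as far as I understand the nnet3 code,
+# RepeatedAffineComponent applies one small shared affine transform to each of
+# the consecutive equal-sized blocks of its input, which is what keeps the
+# large Jesus-layer dims affordable; the change referred to above is, I
+# believe, the natural-gradient variant of that component.)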
+ +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
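+  # (Check, for reference: the "same total context" claim in the header can be
+  # verified by summing each layer's extreme offsets.  The 3t splicing
+  # "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" gives
+  # 2+3+3+6+6 = 20 frames of context on each side; the six-layer 3u splicing
+  # below gives 2+3+3+3+3+6 = 20 on each side as well, consistent with
+  # re-using the 3t egs via --egs-dir below.)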
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh new file mode 100755 index 00000000000..c7fcb7e24f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# _3v is as _3t but decreasing the --num-jesus-blocks from 100 to 50. +# I stopped it early after likelihoods were not promising: +# on iter 90, train prob was -0.1226->-0.1240, valid -0.1304->-0.1340. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. 
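+# (Aside, not from the original notes: the 1/sqrt(2) factor mentioned in the
+# '_3r' note above follows the usual assumption that the noise in a minibatch
+# gradient estimate scales like 1/sqrt(minibatch-size); halving the minibatch
+# therefore scales typical update magnitudes by about sqrt(2), and shrinking
+# max-change by the same factor keeps the constraint comparably tight:
+#   awk 'BEGIN{ printf "1/sqrt(2) = %.4f\n", 1/sqrt(2) }'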
+ +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
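+  # (Side note, not part of the recipe: the brace expansion in the
+  # create_split_dir.pl call above simply enumerates four storage disks;
+  #   echo /export/b0{1,2,3,4}
+  # prints "/export/b01 /export/b02 /export/b03 /export/b04", and the script
+  # then spreads the actual storage behind $dir/egs/storage across those
+  # paths via symlinks, so egs I/O is not bottlenecked on one file system.)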
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --num-jesus-blocks 50 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh new file mode 100755 index 00000000000..e4165e54de6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh @@ -0,0 +1,332 @@ +#!/bin/bash + +# _3w is as _3t but instead of having a rectangular affine component in each +# layer, making it square (700->600 not 1300->400), and introducing a new script +# option --final-hidden-dim to have something like a bottleneck at the last +# layer, to avoid a blowup in parameters. +# (note: num-params was slightly smaller, 4.8 million vs 5.3 +# I stopped this on iter 65 after likelihoods were not promising: +# on iter 63, train -0.133->-0.138, valid -0.138->-0.141. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. 
+# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. 
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on 2m 2o 2y [ now comparing 2o->2y:]
+# train_dev,tg 17.22 17.24 16.99 0.2% better
+# train_dev,fg 15.87 15.93 15.86 0.1% better
+# eval2000,tg 18.7 18.7 18.9 0.2% worse
+# eval2000,fg 17.0 16.9 17.0 0.1% worse
+
+# train-prob,final -0.0803 -0.0835
+# valid-prob,final -0.0116 -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will
give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
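+
+  # Per the header above: relative to 3t the egs are re-used (--egs-dir points at
+  # the 3t egs), the jesus-opts below use --jesus-forward-input-dim 600 and
+  # --jesus-forward-output-dim 800, and the new --final-hidden-dim 400 option
+  # acts as a bottleneck at the last layer to avoid a blowup in parameters.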
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 800 --final-hidden-dim 400 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh new file mode 100755 index 00000000000..1585d209a93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# _3x is as _3s (and continuing the same kind of experimentation as in 3t->3w)... +# increasing --jesus-forward-output-dim from 1500 to 2000. +# More overtraining: final-train -0.0852->-0.0799, final-valid -0.1231->-0.1261, +# WER effect is very tiny but maybe slightly better. 
+#a03:s5c: ./show_wer.sh 3x +#%WER 17.78 [ 8750 / 49204, 910 ins, 2405 del, 5435 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 16.60 [ 8166 / 49204, 921 ins, 2290 del, 4955 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.5 | 4459 42989 | 82.7 11.4 5.9 2.2 19.5 57.5 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.7 | 4459 42989 | 84.3 10.3 5.5 1.9 17.7 54.6 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
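+
+  # Per the header above: relative to 3s, --jesus-forward-output-dim goes from
+  # 1500 to 2000; --final-hidden-dim 350 (the bottleneck option introduced in 3w)
+  # is also set, and the egs dumped by the 3t run are re-used via --egs-dir.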
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_3t_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 2000 --final-hidden-dim 350 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh
new file mode 100755
index 00000000000..042ec84898b
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# _3y is as _3s but doubling jesus-hidden-dim from 15000 to 30000.
+# not promising: by iteration 228, train prob changed -0.09583->-0.09575, and
+# valid prob from -0.1213 -> -0.1239. Killed it.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining. Results are a bit better than 3p but still
+# not as good as 2y.
+ +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
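+
+  # Per the header above: relative to 3s, --jesus-hidden-dim is doubled from
+  # 15000 to 30000; the egs from the 2y run are re-used via --egs-dir.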
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 30000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh new file mode 100755 index 00000000000..f1fa2c5a45e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +# _3z is as _3s, but reducing the target num-states in the tree building from 9k to 6k. +# A slight degradation in WER, but it's not 100% consistent. The final train-prob +# was worse -0.0852 -> -0.0888, and valid-prob was worse -0.1231->-0.1280. +#./show_wer.sh 3z +#%WER 18.05 [ 8883 / 49204, 990 ins, 2397 del, 5496 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.50 [ 8120 / 49204, 960 ins, 2234 del, 4926 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.5 11.9 5.5 2.2 19.7 57.6 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 1.9 17.8 55.1 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
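+
+  # Per the header above, 3z only changes the tree: build_tree.sh (stage 11) is
+  # run with 6000 leaves instead of the 9000 used in 3s, and stage is set to 11
+  # at the top of this script so that the tree-building stage actually runs.
+  # Unlike 3w/3x/3y, no --egs-dir is passed below, so the egs are not re-used
+  # from an earlier run.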
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh
new file mode 100755
index 00000000000..c02ad2cb0e4
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh
@@ -0,0 +1,349 @@
+#!/bin/bash
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+# WER is maybe a fraction worse than 3s (see below); final train prob is
+# worse -0.0852 -> -0.0879, and valid prob is better -0.1231 -> -0.1213.
+#./show_wer.sh 4a
+#%WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
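As a pointer forward: the --leftmost-questions-truncate=-1 setting described in the _2m note just below is what the configuration section of this script still defaults to. A minimal restatement of how that value reaches the tree-building stage, with the paths and the 9000-leaf target copied from stage 11 of this script (nothing new is introduced here):

# -1 disables truncation of the leftmost-context questions, per the _2m note.
leftmost_questions_truncate=-1
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
  --leftmost-questions-truncate $leftmost_questions_truncate \
  --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir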
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
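To make the _p and _2k notes concrete, the two ways the edge-of-eg handling is expressed in the train_tdnn.sh calls of this patch are sketched below; both option strings are copied verbatim from this script and from run_tdnn_4d.sh later in the patch, and are shown side by side purely for comparison:

# 4a-style (this script): no overlap between egs, derivative weights not applied.
apply_deriv_weights=false
egs_opts="--frames-overlap-per-eg 0"
# 4d-style (later in this patch): overlapped egs with zeroed, then ramped, derivative weights.
apply_deriv_weights=true
egs_opts="--frames-overlap-per-eg 10 --cut-zero-frames 5"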
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh new file mode 100755 index 00000000000..aad278c3037 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4b is as _4a, but even narrower splice-indexes in 1st layer (no splicing) +# stopped early after train and valid likelihoods were not promising. +# [later accidentally overwrote and moved the dir.] + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
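Since _4a and _4b differ from _3s only in the first entry of --splice-indexes, the three strings are easier to compare side by side. The 4a and 4b values are copied from the train_tdnn.sh calls in this patch; the wider first layer is the one used by the preceding script in the patch, presumed here to be the 3s configuration:

# The first layer narrows from 5 spliced frames, to 3, to none; later layers are unchanged.
splice_indexes_3s="-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
splice_indexes_4a="-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
splice_indexes_4b="0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"   # no splicing in layer 1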
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
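A compact restatement of the LM options referred to above: the older estimation code (_2d and earlier) took options spelled with --num-extra-states, while the code used from _2i onwards, and by the train_tdnn.sh call below, exposes only --num-extra-lm-states. Both strings are copied from the comments and the script, not new options:

# pre-_2i code path: explicit pruned-4-gram options.
lm_opts_old="--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
# _2i-and-later code path (what this script passes as --lm-opts):
lm_opts_new="--num-extra-lm-states=2000"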
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh new file mode 100755 index 00000000000..d9060251844 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
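As a quick sanity check on the _2y change, the per-iteration eg count implied by the values set in this script (1.2 million frames per iteration, 150 frames per eg) works out as below; this is just arithmetic on numbers already in the script:

frames_per_iter=1200000
frames_per_eg=150
echo $((frames_per_iter / frames_per_eg))   # 8000 egs per iteration, versus ~5333 at the old 800k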
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh new file mode 100755 index 00000000000..1ae220dc21a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4d is as _4a, but with --egs-opts "--frames-overlap-per-eg 10 +# --cut-zero-frames 5" and changing apply-deriv-weights to true... this to +# activate the new-style derivative weights. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
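For orientation, the jesus-layer sizes discussed in the _3r/_3s/_4c notes, as they appear in the --jesus-opts strings of this patch; this script (4d) keeps the 4a sizes, and only 4c halves the hidden dim (the _3r hidden dim is inferred from the _3d options quoted above):

# forward-input-dim / forward-output-dim / hidden-dim
#   _3r:             500 / 1500 / 15000
#   _3s, _4a, _4d:   400 / 1500 / 15000
#   _4c:             400 / 1500 /  7500
jesus_opts="--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25"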
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
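The _2c note above is the reason every graph-building stage in this patch passes --self-loop-scale 1.0 to mkgraph; the stage-13 fragment, copied from further down in this script, is simply:

# Test-time transition/self-loop scale of 1.0, matching the training-side change made in _2c.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg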
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights true \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --cut-zero-frames 5" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh new file mode 100755 index 00000000000..fea5495ee06 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh @@ -0,0 +1,362 @@ +#!/bin/bash + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. +# big improvement- about 0.7% WER abs. Considering the non-l2 part of the objf, the +# final valid objf c->e is -0.1241->-0.1266 [and the l2 term is -0.0196]. +# and for the training st it's -0.08820 -> -0.1149. 
+ + +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4c +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. 
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. 
It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
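+
+  # As noted at the top of this script, the only change relative to the 4c
+  # configuration is the --l2-regularize 0.0001 option passed below; the egs
+  # are again re-used from the tdnn_2y_sp run via --egs-dir.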
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.0001 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh new file mode 100755 index 00000000000..36d5f188c56 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh @@ -0,0 +1,366 @@ +#!/bin/bash + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+        --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" \
+        --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+          $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh
new file mode 100755
index 00000000000..430c6c28c70
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh
@@ -0,0 +1,365 @@
+#!/bin/bash
+
+# _4g is as _4c, but reducing the --jesus-hidden-dim further from 7500 to 4000.
+# Strangely, the trend from 4a->4c does not continue: instead of continuing to get worse,
+# the train and valid probs both get better.
+
+#                    4a       4c       4g
+# Final train prob:  -0.0879  -0.08820 -0.08784
+# Final valid prob:  -0.1214  -0.1241  -0.1204
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+# Yay-- WER is slightly better or the same. Final train-prob is worse
+# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241.
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 4000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh new file mode 100644 index 00000000000..9125d4e7967 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh @@ -0,0 +1,386 @@ +#!/bin/bash + +# _4n is as _4f, but adding the [new] option --convert-repeated-to-block-iter=100. +# reusing iter 100 of model 4f to avoid some iterations of training [did this by +# doing (cd exp/chain; cp -r tdnn_4f_sp tdnn_4n_sp), and then running this script with +# --iter 100]. +# [note: to get the block-affine stuff to train fast enough to make a difference +# I multiplied a factor of sqrt(num-blocks) into the learning-rate factor in +# the code. That change is not committed.] +# +# Essentially no effect on WER, but train and valid probs are worse. +# ./compare_wer.sh 4f 4n +# System 4f 4n +# WER on train_dev(tg) 16.83 16.84 +# WER on train_dev(fg) 15.73 15.69 +# WER on eval2000(tg) 18.4 18.4 +# WER on eval2000(fg) 16.6 16.6 +# Final train prob -0.105832 -0.111309 +# Final valid prob -0.123021 -0.123601 + + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
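The show_wer.sh and compare_wer.sh helpers quoted throughout these comments are not included in this patch. As an editorial aside, a minimal stand-in for the train_dev numbers, assuming the standard Kaldi scoring layout (per-LM-weight wer_* files in each decode directory and utils/best_wer.sh on the path), might look like the sketch below; the eval2000 rows instead come from the sclite .sys files under score_*/ visible in the paths above.

    # hypothetical helper, not part of the patch: print the best WER line per decode dir
    for d in exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg \
             exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg; do
      grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
    done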
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --convert-repeated-to-block-iter 100 \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh new file mode 100755 index 00000000000..d2b073cdc77 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# _4p is as _4f, but one fewer layer, and making the final-layer context wider to +# compensate; also increasing the jesus-layer input and output dims 400->500 and 1500->1600 to +# somewhat compensate for the reduction in parameters. + +# definitely worse. Later with 4r I go in the opposite direction by adding a new layer, +# and get a small improvement. +# ./compare_wer.sh 4f 4p +# System 4f 4p +# WER on train_dev(tg) 16.83 17.36 +# WER on train_dev(fg) 15.73 16.10 +# WER on eval2000(tg) 18.4 19.1 +# WER on eval2000(fg) 16.6 17.2 +# Final train prob -0.105832 -0.104439 +# Final valid prob -0.123021 -0.125576 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
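The parameter counts quoted in the _3s note above (5.4 million, versus 8.8 million for 3p and 12.1 million for 2y) can be checked directly from the trained models. An editorial sketch, assuming the experiment directories exist and Kaldi's nnet3-am-info binary is on the PATH (its output should include a num-parameters line):

    # not part of the patch: print the parameter count of each trained model
    for x in 2y 3p 3s; do
      printf 'tdnn_%s_sp: ' "$x"
      nnet3-am-info exp/chain/tdnn_${x}_sp/final.mdl | grep num-parameters
    done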
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 450 --jesus-forward-output-dim 1600 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..9f2534f4f22 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh new file mode 100755 index 00000000000..64831b5802a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
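A note on the --splice-indexes strings discussed above (_3d, _3f, _4a): the per-layer offsets compound, so the model's total acoustic context is the sum of the most negative offsets (left) and of the most positive offsets (right) across layers, and that total context is what decides whether previously dumped egs can be re-used. An editorial sketch that computes the context for one of the strings used in these scripts, ignoring the recurrence entries after ':':

    # not part of the patch; assumes offsets within a layer are comma-separated
    splice_indexes="-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
    left=0; right=0
    for layer in $splice_indexes; do
      layer=${layer%%:*}                          # drop recurrence offsets such as ':-3'
      min=$(echo "$layer" | tr ',' '\n' | sort -n | head -n1)
      max=$(echo "$layer" | tr ',' '\n' | sort -n | tail -n1)
      left=$((left - min)); right=$((right + max))
    done
    echo "total left context: $left, total right context: $right"   # 17 and 12 here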
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
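To make the _p note above concrete: with the weights dumped alongside the egs and --frames-overlap-per-eg 30, the derivative weights at each chunk edge are zero for 10 frames and then ramp linearly up to 1.0 over the next 10. A hedged illustration of that schedule (the scripts' exact ramp may differ in detail):

    # editorial sketch: print an assumed 30-frame edge-weight schedule
    awk 'BEGIN {
      for (t = 0; t < 30; t++) {
        if      (t < 10) w = 0.0;                # zero-derivative region
        else if (t < 20) w = (t - 9) / 10.0;     # linear ramp: 0.1, 0.2, ..., 1.0
        else             w = 1.0;                # full weight
        printf("frame %2d  weight %.1f\n", t, w);
      }
    }'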
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
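# [editorial aside, not part of the patch] The create_split_dir.pl block above is
# specific to the CLSP grid: the brace expansion lists several local disks, and the
# script spreads the (large) egs archives across them via symlinks under
# $dir/egs/storage. On another cluster you would either delete that block or point
# it at your own scratch disks, e.g. (hypothetical paths):
#
#   utils/create_split_dir.pl \
#     /mnt/scratch{1,2,3}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage \
#     $dir/egs/storage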
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh new file mode 100755 index 00000000000..92a1a7da277 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option- +#currently in a branch] +# Overall no real change. + +# ./compare_wer.sh 4f 4s +# System 4f 4s +# WER on train_dev(tg) 16.83 16.82 +# WER on train_dev(fg) 15.73 15.62 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.111371 +# Final valid prob -0.123021 -0.12648 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
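The "Final train prob" and "Final valid prob" rows in tables like the one above are read off the nnet3 training diagnostics. An editorial sketch, under the assumption that this script version writes per-iteration logs named compute_prob_train.<iter>.log and compute_prob_valid.<iter>.log in $dir/log (check the log directory if yours differ):

    dir=exp/chain/tdnn_4s_sp      # hypothetical: point this at the model you trained
    for kind in train valid; do
      log=$(ls -v $dir/log/compute_prob_${kind}.*.log | tail -n 1)   # last iteration's log
      grep -H 'Overall log-probability' $log
    done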
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.02 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh new file mode 100755 index 00000000000..30b383d05d7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# [note, I accidentally overwrote this directory afterwards, and moved it.] +# It's really not clear whether it's helpful. +# ./compare_wer.sh 4f 4t +# System 4f 4t +# WER on train_dev(tg) 16.83 16.75 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.112721 +# Final valid prob -0.123021 -0.129688 + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
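# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): the _x, _z and _2a runs
# described above differ only in the --num-extra-states value passed to the
# phone-LM estimation through --lm-opts.  The loop below merely assembles the
# option strings; the ${dir}_lm$n output naming is hypothetical, and note that
# the newer training calls in this file spell the option --num-extra-lm-states.
for n in 0 2000 8000; do
  lm_opts="--num-extra-states=$n"
  echo "would train with: --lm-opts \"$lm_opts\"  (e.g. output dir ${dir}_lm$n)"
done
# ---------------------------------------------------------------------------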
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.04 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh new file mode 100755 index 00000000000..ae7cf02b426 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh @@ -0,0 +1,384 @@ +#!/bin/bash + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +# It seems a bit better on average. +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs.
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
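# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): how much total acoustic
# context a --splice-indexes string implies.  Per-layer splice offsets add up
# through the network, so summing the most negative and most positive offset
# of each layer gives the model's left/right input context (compare the
# --extra-left-context 20 passed to the decode stage further down).
splice="-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"   # as used in this script
left=0; right=0
for layer in $splice; do
  min=$(echo "${layer//,/ }" | tr ' ' '\n' | sort -n | head -n1)
  max=$(echo "${layer//,/ }" | tr ' ' '\n' | sort -n | tail -n1)
  if [ "$min" -lt 0 ]; then left=$(( left - min )); fi
  if [ "$max" -gt 0 ]; then right=$(( right + max )); fi
done
echo "total left context: $left frames, right context: $right frames"
# ---------------------------------------------------------------------------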
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh new file mode 100755 index 00000000000..9cdbfefb5a2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
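# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): the train_dev %WER lines
# quoted above (e.g. .../decode_train_dev_sw1_tg/wer_11_0.0) are scoring
# outputs for different LM weights and insertion penalties inside each decode
# directory; picking the best one by hand amounts to something like the loop
# below.  The eval2000 numbers come from sclite *.sys files instead, and the
# directory names here are examples only.
for d in exp/chain/tdnn_2y_sp/decode_train_dev_sw1_{tg,fsh_fg}; do
  [ -d "$d" ] || continue          # skip decodes that have not been run
  grep WER "$d"/wer_* | sort -n -k2 | head -n1
done
# ---------------------------------------------------------------------------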
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
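# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): the egs edge handling
# introduced in _p above, i.e. per-frame derivative weights that are zero for
# the outer 10 frames of a chunk and then ramp linearly up to 1.0 over the
# next 10 frames.  The real get_egs/training code may use a different ramp
# shape; this only illustrates the idea for one edge of a chunk.
for t in $(seq 0 24); do
  if   [ "$t" -lt 10 ]; then w=0.0
  elif [ "$t" -lt 20 ]; then w=$(awk -v t="$t" 'BEGIN { printf "%.1f", (t - 9) / 10 }')
  else                       w=1.0
  fi
  echo "frame $t: derivative weight $w"
done
# ---------------------------------------------------------------------------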
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh new file mode 100755 index 00000000000..6dd5c587f7a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -0,0 +1,397 @@ +#!/bin/bash + +# _4w is as _4v, but doubling --xent-regularize to 0.2 WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
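+# [Illustrative aside, not from the original experiments: the _3r note above
+# argues that when the minibatch is halved the max-change should be scaled by
+# 1/sqrt(2) rather than 1/2, on the assumption that per-minibatch gradient noise
+# grows like 1/sqrt(minibatch-size).  The implied factor, as a quick check:
+#   awk -v m=1.0 -v r=0.5 'BEGIN{printf "halved: %.3f   sqrt-scaled: %.3f\n", m*r, m*sqrt(r)}'
+#   # halved: 0.500   sqrt-scaled: 0.707
+# i.e. a max-change of 1.0 becomes about 0.707 rather than 0.5.]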
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh new file mode 100755 index 00000000000..0290e0bdbd5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. 
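+# [Illustrative aside, not from the original experiments: the per-set deltas
+# quoted near the top of this file for 4u -> 4x, "(+0.2, +0.2, 0.0, -0.1)", can
+# be reproduced from the 4u and 4x columns of the table above:
+#   paste <(printf '16.47\n15.23\n18.4\n16.7\n') <(printf '16.63\n15.42\n18.4\n16.6\n') \
+#     | awk '{printf "%+.2f\n", $2 - $1}'
+#   # +0.16 +0.19 +0.00 -0.10, i.e. (+0.2, +0.2, 0.0, -0.1) at one decimal.]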
+ +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
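+# [Illustrative aside, not from the original experiments: the _2y note above
+# raises --frames-per-iter from 800k to 1.2M to amortize per-iteration overhead.
+# Assuming the number of training iterations scales inversely with
+# frames-per-iter (job-count details aside), the saving is roughly a third:
+#   awk 'BEGIN{printf "iterations scaled by %.2f (about %.0f%% fewer)\n", 800000/1200000, 100*(1 - 800000/1200000)}'
+#   # iterations scaled by 0.67 (about 33% fewer)]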
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
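+# [Illustrative aside, not from the original experiments: the _p note above
+# describes per-frame derivative weights dumped with the egs -- with the 30-frame
+# overlap, 10 frames of zero weight at each edge and then a ramp to 1.0 over the
+# next 10 frames.  A sketch of one edge, assuming a simple linear ramp:
+#   seq 0 24 | awk '{t=$1; w = t<10 ? 0 : (t<20 ? (t-10+1)/10 : 1); printf "%.1f ", w} END{print ""}'
+#   # ten 0.0's, then 0.1 0.2 ... 1.0, then 1.0 for the interior frames.]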
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
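+  # Illustrative sketch (not part of the original recipe): per the _i comment
+  # above, moving to 150-frame egs with minibatch 128 (from 75 frames with
+  # minibatch 256) keeps the number of frames per minibatch unchanged,
+  #   echo "$((150 * 128)) vs $((75 * 256))"   # -> 19200 vs 19200
+  # so memory use per minibatch stays roughly the same.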
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
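+# [Illustrative aside, not from the original experiments: the _3e note above
+# doubles --num-jesus-blocks to reduce computation.  Assuming the big matrix in
+# the jesus layer is block-diagonal over those blocks, its parameter count is
+# roughly jesus-hidden-dim * jesus-output-dim / num-blocks, so for the 3d dims
+# (hidden 15000, output 1800):
+#   awk 'BEGIN{for (b=100; b<=200; b*=2) printf "blocks=%d: %d params in that matrix\n", b, 15000*1800/b}'
+#   # blocks=100: 270000   blocks=200: 135000, i.e. halved.]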
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
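+  # Illustrative sketch (not part of the original recipe): the _4v comment at the
+  # top of this file raises --max-param-change from 1.0 to 2.0 because the xent
+  # head contributes a large share of the parameter change.  Assuming the global
+  # cap rescales the whole update when its 2-norm exceeds the limit, i.e.
+  #   scale = min(1, max-param-change / ||update||),
+  # a big xent-layer update also shrinks every other layer's step; for example:
+  #   awk -v cap=1.0 -v xent=1.2 -v rest=0.8 \
+  #     'BEGIN{n=sqrt(xent^2 + rest^2); s=(n>cap)?cap/n:1; printf "norm=%.2f scale=%.2f\n", n, s}'
+  #   # norm=1.44 scale=0.69  (with cap=2.0 the scale would be 1.00)
+  # The xent/rest norms here are made-up numbers purely for illustration.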
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
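+# (For reference, the %WER lines quoted in these headers are the best-scoring
+# entries from the decode directories; a minimal way to pull them out, assuming
+# the standard utils/best_wer.sh helper of this setup, is e.g.:
+#   grep WER exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh
+#   grep Sum exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_*/*.ctm.filt.sys | utils/best_wer.sh )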
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
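+  # A few informal notes on the main options in the call below (the
+  # authoritative documentation is steps/nnet3/chain/train_tdnn.sh itself):
+  #  - --xent-regularize adds a separate cross-entropy output whose objective
+  #    is added to the chain objective, scaled by this weight.
+  #  - --leaky-hmm-coefficient 0.1 smooths the denominator computation by,
+  #    roughly speaking, allowing a small probability of leaking into any HMM
+  #    state on each frame.
+  #  - --egs-dir exp/chain/tdnn_2y_sp/egs re-uses the examples dumped for the
+  #    2y run; the splice-indexes were chosen narrow enough (see the _4r note
+  #    above) that no extra frame context is needed.
+  #  - with --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" the
+  #    total acoustic context of the network is 1+1+3+3+3+6 = 17 frames on the
+  #    left and 1+2+3+3+3+0 = 12 frames on the right.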
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
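+  # Note: as described at the top of this file, 5c branches from 4w rather than
+  # from 5b, so compared with the 5b call this one uses --xent-regularize 0.05,
+  # has no --leaky-hmm-coefficient, and keeps the smaller jesus dims
+  # (400/1400 rather than 500/1800).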
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
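+# (As a rough guide to what --frames-per-iter controls: the number of training
+# iterations is approximately
+#   num_epochs * total_training_frames / (frames_per_iter * average_num_jobs),
+# so the 800k -> 1.2M change described in the 2y note above reduced the number
+# of iterations, and with it the total model-averaging overhead, by about a
+# third.)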
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
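+  # Relative to the 5b call, the only option changes below are the larger jesus
+  # dims (600/2000 instead of 500/1800).  One way to check what this costs in
+  # model size, once both models exist, is something like:
+  #   nnet3-am-info exp/chain/tdnn_5b_sp/final.mdl | grep num-parameters
+  #   nnet3-am-info exp/chain/tdnn_5d_sp/final.mdl | grep num-parameters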
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
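+# (The tables above, including the "Final train prob" / "Final valid prob" rows,
+# are in the format produced by the compare_wer.sh script used throughout these
+# headers; once this run and 5b have both finished, "./compare_wer.sh 5b 5e"
+# should reproduce the comparison at the top of this file.)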
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
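# To make the edge handling described under _p above concrete, this is roughly
# the per-frame deriv-weight shape I have in mind for one 150-frame eg: 10
# zero-weight frames and then a 10-frame linear ramp up to 1.0 at each edge.
# Illustration only -- the actual weights are whatever gets dumped alongside
# the egs:
#
# awk -v n=150 'BEGIN {
#   for (t = 0; t < n; t++) {
#     d = (t < n - 1 - t) ? t : n - 1 - t;              # distance to nearer edge
#     w = (d < 10) ? 0 : (d < 20 ? (d - 9) / 10.0 : 1);
#     printf("%d %.1f\n", t, w);
#   }
# }'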
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.05 is better than 0.2 or 0.1). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
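# The way I tally the temporal context implied by this setup's splice indexes
# ("-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0", from the train command
# below): sum the most-negative offsets for the left context and the
# most-positive for the right, giving 1+1+3+3+3+6 = 17 frames on the left and
# 1+2+3+3+3+0 = 12 on the right (for what it's worth, the decode below passes
# --extra-left-context 20, which is more than that).  A throwaway check of the
# arithmetic:
#
# echo "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" | awk '{
#   left = right = 0;
#   for (i = 1; i <= NF; i++) {
#     n = split($i, a, ",");
#     for (j = 1; j <= n; j++) { v = a[j] + 0;
#       if (j == 1 || v < min) min = v;
#       if (j == 1 || v > max) max = v; }
#     left += -min; right += max;
#   }
#   printf("left context = %d frames, right context = %d frames\n", left, right);
# }'   # -> left context = 17 frames, right context = 12 frames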
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
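# ("Helpful on average" in these notes is just the unweighted mean of the four
# WER deltas -- train_dev tg/fg and eval2000 tg/fg; e.g. for the 5a->5b deltas
# quoted further up in this file:)
#
# echo "-0.35 -0.35 -0.1 0" | awk '{ for (i = 1; i <= NF; i++) s += $i; printf("mean WER change = %.2f\n", s / NF) }'
# # -> mean WER change = -0.20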
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5g.sh b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh new file mode 100755 index 00000000000..784facf5a82 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + + +# Here is decoding with --frames-per-chunk 300. A fairly consistent +# improvement. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.27 +#WER on train_dev(fg) 14.32 14.21 +#WER on eval2000(tg) 17.3 16.9 +#WER on eval2000(fg) 15.5 15.2 +#Final train prob -0.110056 -0.103752 +#Final valid prob -0.129184 -0.125641 + + +# *All results below here are broken-- they were computed when I had a bug in +# the index-permutation, and the blocks weren't computed right for the jesus +# layer.* +# Here are WERs when the frames-per-chunk was 50: +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.62 +#WER on train_dev(fg) 14.32 14.42 +#WER on eval2000(tg) 17.3 17.7 +#WER on eval2000(fg) 15.5 16.0 + +# and here with 150: +# WER on train_dev(tg) 15.43 15.46 +# WER on train_dev(fg) 14.32 14.38 +# WER on eval2000(tg) 17.3 17.3 +# WER on eval2000(fg) 15.5 15.5 + + +# and here with 300 ... we do see a small improvement +# at this value. 
(could probably improve it further +# by modifying the model to average over a larger window). +#WER on train_dev(tg) 15.43 15.29 +#WER on train_dev(fg) 14.32 14.17 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.4 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + +# Below is also with chunk-size=300, but with the 'wide' model +# that sees more context. Oddly, the WER is worse. It looks like +# the model may be doing something different than just learning +# speaker characteristics. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.54 +#WER on train_dev(fg) 14.32 14.34 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
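# Circling back to what 5g actually adds: the statistics-extraction layer shows
# up as one extra term in the --splice-indexes string of the train command
# below (the 4th layer's entry grows a "mean+stddev(...)" part):
#
#   --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0"
#
# My reading of mean+stddev(-99:3:9:99) -- worth double-checking against the
# config-generation script -- is (left-context : input-period : stats-period :
# right-context): pool a mean and a standard deviation over roughly t-99..t+99,
# sampling the layer's input every 3 frames and re-emitting the pooled stats
# every 9 frames.  That is presumably why decoding with a longer
# --frames-per-chunk (300 vs 50) helps here: short chunks truncate the window
# the stats are pooled over.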
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# if [ $stage -le 15 ]; then +# # get wide-context model +# nnet3-am-copy --binary=false $dir/final.mdl - | \ +# sed 's/Context> 99/Context> 306/g' | nnet3-am-copy - $dir/wide.mdl +# for decode_set in train_dev eval2000; do +# ( +# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --frames-per-chunk 300 --iter wide \ +# --nj 50 --cmd "$decode_cmd" \ +# --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ +# $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; +# fi +# ) & +# done +# fi + + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5h.sh b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh new file mode 100755 index 00000000000..5eeb5ca5d03 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +# _5h is as _5g, but only mean, no stddev, stats. + +# The following comparison is with 150 frames per chunk +# in both the 5g and 5h decodes. No consistent WER difference +# with either 5e or 5g. 
+#System 5e 5g 5h +#WER on train_dev(tg) 15.43 15.46 15.45 +#WER on train_dev(fg) 14.32 14.38 14.34 +#WER on eval2000(tg) 17.3 17.3 17.2 +#WER on eval2000(fg) 15.5 15.5 15.7 +#Final train prob -0.110056 -0.105725 -0.106213 +#Final valid prob -0.129184 -0.125756 -0.126809 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
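# A quick worked example for the absolute-vs-relative figures quoted in this history, using
# the _s numbers above (arithmetic only, not part of the recipe): 19.8 -> 18.0 on eval2000 is
# 1.8 absolute, or about 9% relative:
awk 'BEGIN { a=19.8; b=18.0; printf("abs=%.1f  rel=%.1f%%\n", a-b, 100*(a-b)/a) }'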
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5i.sh b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh new file mode 100755 index 00000000000..9ffc37793ee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh @@ -0,0 +1,432 @@ +#!/bin/bash + +# _5i is as _5g, but adding the mean+stddev features for all hidden layers. +# a little worse than 5g (but for Remi Francis it was a little better). +#local/chain/compare_wer.sh 5e 5g 5i +#System 5e 5g 5i +#WER on train_dev(tg) 15.43 15.27 15.41 +#WER on train_dev(fg) 14.32 14.21 14.47 +#WER on eval2000(tg) 17.3 16.9 17.0 +#WER on eval2000(fg) 15.5 15.2 15.4 +#Final train prob -0.110056 -0.103752 -0.102539 +#Final valid prob -0.129184 -0.125641 -0.12375 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
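# On the directory names in the results above: the wer_11_0.0 / score_10_0.0 suffixes follow
# the usual Kaldi scoring convention of <LM-weight>_<word-insertion-penalty>.  show_wer.sh
# itself is a local helper not shown here; a rough stand-in using standard utilities (the path
# is illustrative) would be:
grep WER exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh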
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
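# The _p note above describes the scheme for the edges of the egs: ten frames with zero
# derivative weight on each side, then a linear ramp up to 1.0 over the next ten frames.  A
# toy print-out of that left-edge ramp (illustration only, not the code that writes the egs):
awk 'BEGIN { for (t=1; t<=25; t++) { w = (t<=10) ? 0 : ((t<=20) ? (t-10)/10.0 : 1.0); printf("frame %2d  weight %.1f\n", t, w) } }'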
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
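  # About the splice-indexes passed to train_tdnn.sh just below: each mean+stddev(...) entry
  # asks for a statistics-extraction layer that appends, per feature dimension, the mean and
  # standard deviation pooled over a wide window of frames (the -99:1:9:99-style specifier
  # configures that window; its exact fields are not spelled out here).  A toy illustration of
  # the pooled statistics for a one-dimensional "feature", illustration only:
  # echo "1 2 3 4 5" | awk '{for(i=1;i<=NF;i++){s+=$i;ss+=$i*$i} m=s/NF; printf("mean=%.2f stddev=%.2f\n", m, sqrt(ss/NF-m*m))}'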
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2,mean+stddev(-99:1:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -6,-3,0,mean+stddev(-99:3:9:99)" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5j.sh b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh new file mode 100755 index 00000000000..892a79fd2a8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
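# A rough summary of the two regularizers that recur in this history, as we understand the
# chain training options (see the training code for the exact definitions): --xent-regularize
# adds a separate cross-entropy output whose objective is weighted by the given factor and
# used only as a regularizer, while --l2-regularize penalizes the squared magnitude of the
# chain output, so halving it (0.0001 -> 0.00005, the 4e -> 4f change) halves that penalty.
# Toy arithmetic only, ignoring any constant factor:
awk 'BEGIN { y2 = 1.0*1.0 + 2.0*2.0 + 0.5*0.5; printf("l2 term: %g at 1e-4, %g at 5e-5\n", 1e-4*y2, 5e-5*y2) }'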
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
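# A small bookkeeping note tying the egs discussion above to the options below: the chain
# setup here uses a frame-subsampling factor of 3 (see the build_tree call further down), so
# the network output is evaluated once per three input frames; a 150-frame eg
# (frames_per_eg=150) therefore carries roughly 150 / 3 = 50 supervision frames.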
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5k.sh b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh new file mode 100755 index 00000000000..b6c984ed253 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh @@ -0,0 +1,454 @@ +#!/bin/bash + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
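# As a sanity check on how the bracketed %WER lines above are put together: the error count is
# insertions + deletions + substitutions over the number of reference words.  For the 4f
# train_dev line, 870 + 2354 + 5058 = 8282 and 100 * 8282 / 49204 = 16.83.  The same check as
# a one-liner (arithmetic only):
awk 'BEGIN { printf("%%WER %.2f\n", 100*(870+2354+5058)/49204) }'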
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
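# For reading the pipe-delimited eval2000 lines above: these are sclite summaries, where the
# two counts are sentences and reference words (4459 and 42989) and the percentages are Corr,
# Sub, Del, Ins, overall Err and the sentence-error rate.  As a check on the 3g line:
# 11.1 + 5.3 + 2.2 = 18.6, matching the reported 18.7 up to rounding, and
# 83.5 + 11.1 + 5.3 = 99.9, i.e. Corr + Sub + Del accounts for the reference words.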
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively treats the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
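To make the _p "deriv-weights" description above concrete, here is roughly the per-frame weight pattern it describes for one 150-frame eg: zeros on the 10 edge frames at each end, a 10-frame linear ramp, and 1.0 in the middle. This is only an illustration of the idea; the exact values are whatever the egs-dumping code writes, and in this particular script they are not even applied, since --apply-deriv-weights is set to false.

awk 'BEGIN {
  n = 150;                                              # frames per eg
  for (t = 0; t < n; t++) {
    if (t < 10 || t >= n - 10)  w = 0.0;                 # zero-derivative edge frames
    else if (t < 20)            w = (t - 9) / 10.0;      # ramp up over 10 frames
    else if (t >= n - 20)       w = (n - 10 - t) / 10.0; # mirror-image ramp down at the end
    else                        w = 1.0;
    printf("%3d %.1f\n", t, w);
  }
}'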
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
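+
+  # Two notes on the train_tdnn.sh call below.  First, the egs are re-used from the 5j run via
+  # --egs-dir rather than re-dumped.  Second, the mean+stddev(-99:3:9:99) entry in --splice-indexes
+  # is the statistics-extraction layer this experiment adds; as I understand the spec, the four
+  # fields are left-context:input-period:stats-period:right-context, i.e. mean and stddev stats
+  # pooled over about +-99 frames, reading the input every 3 frames and emitting stats every 9
+  # frames (if I have the field order wrong, the config-generation script is the reference).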
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5j_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5l.sh b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh new file mode 100755 index 00000000000..d5b51eb7551 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh @@ -0,0 +1,464 @@ +#!/bin/bash + +# _5l is as _5k, but doubling frames-per-eg from 150 to 300, and increasing +# the context radius of the statistics-pooling from 99 to 153. + +# :-( No better than 5k.) +#./compare_wer.sh 5e 5j 5k 5l +#System 5e 5j 5k 5l +#WER on train_dev(tg) 15.43 17.59 16.46 16.68 +#WER on train_dev(fg) 14.32 16.33 15.17 15.40 +#WER on eval2000(tg) 17.3 19.1 18.1 18.3 +#WER on eval2000(fg) 15.5 17.5 16.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502-0.0804455 +#Final valid prob -0.129184 -0.130761 -0.12337 -0.10712 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. 
+#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
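The max-param-change value set further down in this script (1.414) follows the same convention as the _3r note above: when the minibatch size is halved (this run uses --minibatch-size 64 instead of 128), the max-change is scaled by 1/sqrt(2) rather than 1/2. Just to make the arithmetic explicit, since the script hard-codes the rounded result:

awk 'BEGIN { printf("2 / sqrt(2) = %.3f\n", 2 / sqrt(2)) }'   # -> 1.414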
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.414 # was 2; now 2 / sqrt(2) = sqrt(2), since we're using half the minibatch size. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --frames-per-eg 300 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-153:3:9:153) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size 64 \ + --egs-opts "--frames-overlap-per-eg 0" \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5m.sh b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh new file mode 100644 index 00000000000..a9e12357c23 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +# _5m is as _5e, but with a script change where we are randomizing +# the frame shift a bit better. + +# No very clear change, but if anything the optimization is less effective +# and the WER worse -> I'm going to revert this script change. +#System 5e 5m +#WER on train_dev(tg) 15.43 15.57 +#WER on train_dev(fg) 14.32 14.47 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.7 +#Final train prob -0.110056 -0.112539 +#Final valid prob -0.129184 -0.129006 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
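Coming back to what 5m itself changes ("randomizing the frame shift a bit better", see the header at the top of this file): with a frame-subsampling factor of 3, each archive of egs can be trained on at a frame shift of 0, 1 or 2. The loop below is only a toy picture of the difference between a fixed archive-to-shift mapping and a randomized one; the real choice happens inside the training script and the exact scheme may differ.

for archive in 1 2 3 4 5 6; do
  old_shift=$(( archive % 3 ))              # deterministic: the same shift for an archive every time
  new_shift=$(( (archive + RANDOM) % 3 ))   # "randomized a bit better": varies from run to run
  echo "archive $archive: old shift $old_shift, new shift $new_shift"
done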
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively treats the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
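One more aside, on the --splice-indexes used in the train command below: for a plain TDNN like this one (no mean+stddev entry, no recurrence), the total left/right context the model needs is just the sum of the most-negative and most-positive offsets across the layers, which is why the decode stage passes --extra-left-context 20. A quick way to check the numbers:

echo "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" | awk '{
  left = 0; right = 0;
  for (i = 1; i <= NF; i++) {
    n = split($i, a, ",");
    min = a[1] + 0; max = a[1] + 0;
    for (j = 2; j <= n; j++) {
      if (a[j] + 0 < min) min = a[j] + 0;
      if (a[j] + 0 > max) max = a[j] + 0;
    }
    left += min; right += max;
  }
  printf("total left context: %d, total right context: %d\n", -left, right);  # 17 and 12 here
}'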
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5n.sh b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh new file mode 100755 index 00000000000..d4372a418d8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh @@ -0,0 +1,459 @@ +#!/bin/bash + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
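To make the _p note above concrete: with a 30-frame overlap, each chunk gets per-frame derivative weights that are zero for the outermost 10 frames, ramp linearly up to 1.0 over the next 10, and stay at 1.0 in the interior. A rough stand-alone illustration of that pattern (the chunk size and ramp lengths are assumptions taken from the notes, not read from any egs):

  frames_per_eg=150; zero=10; ramp=10
  for ((t=0; t<frames_per_eg; t++)); do
    d=$(( t < frames_per_eg-1-t ? t : frames_per_eg-1-t ))   # distance to the nearer chunk edge
    if   [ $d -lt $zero ];            then w=0.00
    elif [ $d -lt $((zero+ramp)) ];   then w=$(awk -v d=$d -v z=$zero -v r=$ramp 'BEGIN{printf "%.2f",(d-z+1)/r}')
    else                                   w=1.00
    fi
    printf '%s ' "$w"
  done; echo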
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5o.sh b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh new file mode 100755 index 00000000000..86bbe1ad441 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh @@ -0,0 +1,467 @@ +#!/bin/bash + +# _5o is as _5n but adding an extra splicing layer and increasing the +# splice-width slightly on the 1st layer, to get closer to the context in 5n; +# having one more layer running at double-frequency, and reverting the frame-length to +# the same as in the baseline (25ms) to avoid sacrificing frequency resolution. + +# Objective functions improve but WER change is quite small vs 5n (~0.1%). so +# not clear that the extra time is worth it (it's noticeably slower to train as +# that extra layer is at a higher sampling rate). +# +#System 5j 5n 5o +#WER on train_dev(tg) 17.59 16.85 16.83 +#WER on train_dev(fg) 16.33 15.67 15.60 +#WER on eval2000(tg) 19.1 19.1 18.8 +#WER on eval2000(fg) 17.5 17.3 17.2 +#Final train prob -0.114691 -0.116341 -0.111613 +#Final valid prob -0.130761 -0.130884 -0.126765 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
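The tables above come from the local compare_wer.sh / show_wer.sh helpers; for a quick manual look at the train_dev decodes (whose per-LM-weight WERs live in wer_* files, as in the paths printed above), something like the following works (a hand-rolled sketch, not the actual comparison script):

  for sys in 4f 4r; do                      # the two systems compared in the table above
    for d in exp/chain/tdnn_${sys}_sp/decode_train_dev_sw1_{tg,fsh_fg}; do
      [ -d $d ] || continue
      printf '%-55s ' "$d"
      grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh   # picks the best LM-weight/penalty
    done
  done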
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
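The point of the _2y change above is that --frames-per-iter sets how many frames each parallel job processes per iteration, so raising it reduces the number of iterations and with it the per-iteration overhead (model averaging, job startup). A back-of-the-envelope sketch with assumed numbers (the total frame count below is invented purely for illustration):

  num_frames=300000000      # assumed total training frames after speed perturbation -- not measured
  num_epochs=4
  frames_per_iter=1200000
  num_jobs_initial=3; num_jobs_final=16
  avg_jobs=$(( (num_jobs_initial + num_jobs_final) / 2 ))
  echo "roughly $(( num_frames * num_epochs / (frames_per_iter * avg_jobs) )) training iterations"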
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \ + data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl2 $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl2 $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl2 \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5p.sh b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh new file mode 100755 index 00000000000..d2ef7057873 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh @@ -0,0 +1,421 @@ +#!/bin/bash + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
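One small numerical aside on the _3r note earlier in this history (the one about applying a reduced max-change on mix-up iterations): scaling by 1/sqrt(2) instead of 1/2, to track the halved minibatch size, is the difference between about 0.707 and 0.5:

  awk 'BEGIN { printf "1/2 = %.3f   1/sqrt(2) = %.3f\n", 0.5, 1/sqrt(2) }'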
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5q.sh b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh new file mode 100755 index 00000000000..5968a00417e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh @@ -0,0 +1,425 @@ +#!/bin/bash + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
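The edge weighting described for _p above (historical here: the runs in this file set --apply-deriv-weights false and --frames-overlap-per-eg 0) can be pictured with a small sketch: per the comment, each eg gets 10 frames on each side whose output derivatives are weighted 0, followed by 10 frames ramping up to 1.0. The linear ramp below is only an illustration consistent with that description, not the exact formula the egs-dumping code uses.

# plausible per-frame derivative weight near the left edge of an eg (illustration only)
awk 'BEGIN {
  for (t = 0; t < 25; t++) {
    if (t < 10)      w = 0.0;               # first 10 frames: zero derivative weight
    else if (t < 20) w = (t - 9) / 10.0;    # next 10 frames: ramp up to 1.0
    else             w = 1.0;               # interior frames: full weight
    printf("frame %2d  weight %.1f\n", t, w);
  }
}'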
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
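The JHU-grid block just above relies on ordinary shell brace expansion to name one target directory per file system, and utils/create_split_dir.pl then (roughly) sets up $dir/egs/storage so that the bulky egs are spread across those disks rather than filling one of them. Only the expansion itself is shown here; $USER and the date are filled in by the shell at run time, and none of this runs unless the hostname matches *.clsp.jhu.edu.

dir=exp/chain/tdnn_5q_sp   # the value used in this run ("_sp" appended since speed_perturb=true)
echo /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage
# expands to four paths, one each under /export/b05, /export/b06, /export/b07 and /export/b08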
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5r.sh b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh new file mode 100755 index 00000000000..306d76859f9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. +# [abandoned after discovering bug, this thread is picked up in 5s and 5t.] + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
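Those four numbers are just the per-test-set 5a -> 5b changes from the comparison table that follows; spelled out as a throwaway check (WER figures copied from that table):

awk 'BEGIN {
  split("15.86 14.74 17.4 15.6", a);   # 5a: train_dev(tg), train_dev(fg), eval2000(tg), eval2000(fg)
  split("15.51 14.39 17.3 15.6", b);   # 5b, same order
  for (i = 1; i <= 4; i++) printf("%+.2f ", b[i] - a[i]); print ""
}'
# -> -0.35 -0.35 -0.10 +0.00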
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
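The file names quoted in these results (wer_10_0.0, score_9_0.0, ...) encode the language-model weight and insertion penalty used at scoring time; each decode directory holds one such file per setting, and the numbers shown are the best of them. For the train_dev-style decodes, one way to pull out the best line by hand is sketched below (an illustration, not necessarily what show_wer.sh or compare_wer.sh do internally; the eval2000 numbers come from the sclite .sys files under score_*/ instead).

decode_dir=exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg   # one of the directories quoted above
# each wer_<lmwt>_<penalty> file contains a "%WER ..." summary line; keep the lowest
grep -H '%WER' $decode_dir/wer_* | sort -n -k2,2 | head -n1
# should print the .../wer_10_0.0 line with WER 16.83, matching the 4f result above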
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
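The bracketed fields in these show_wer.sh lines read "total errors / reference words, insertions, deletions, substitutions", and the headline %WER is simply their ratio. A throwaway check on the 3d train_dev (tg) line above, just to make the bookkeeping explicit (not part of the recipe):

# WER = 100 * (ins + del + sub) / reference words, using the tdnn_3d numbers quoted above
awk 'BEGIN { ins=1023; del=2155; subs=5361; ref=49204;
             err = ins + del + subs;
             printf("errors=%d  WER=%.2f%%\n", err, 100*err/ref) }'
# prints: errors=8539  WER=17.35%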
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
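Because the script sources ./utils/parse_options.sh after its configuration block, any variable set above it (stage, train_stage, num_epochs, ...) can be overridden from the command line, and stage=12 as shipped assumes the earlier data/lang/tree stages were completed by a previous run. Usage sketch (paths as in this file):

# rerun only graph building and decoding for the 5r setup, reusing the trained model
local/chain/run_tdnn_5r.sh --stage 13

# resume a partly finished run by handing train_tdnn.sh a later --stage
local/chain/run_tdnn_5r.sh --stage 12 --train-stage 150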
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5s.sh b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh new file mode 100755 index 00000000000..65da1e06183 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# Comparing with 5e which is the most recent baseline we actually decoded, +# 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and +# and the new option --self-repair-scale 0.00001 added. +# Also compare 5t and 5v which have even smaller j3sus-hidden-dims. + +#./compare_wer.sh 5e 5s 5t +#System 5e 5s 5t +#WER on train_dev(tg) 15.43 15.47 15.43 +#WER on train_dev(fg) 14.32 14.31 14.34 +#WER on eval2000(tg) 17.3 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 +#Final train prob -0.110056 -0.110928 -0.110752 +#Final valid prob -0.129184 -0.132139 -0.129123 + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). 
+ +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5t.sh b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh new file mode 100755 index 00000000000..9831417003b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. Seems to make no difference to WERs; valid prob improves. + +#local/chain/compare_wer.sh 5e 5s 5t +#System 5e 5s 5t +#WER on train_dev(tg) 15.43 15.47 15.43 +#WER on train_dev(fg) 14.32 14.31 14.34 +#WER on eval2000(tg) 17.3 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 +#Final train prob -0.110056 -0.110928 -0.110752 +#Final valid prob -0.129184 -0.132139 -0.129123 + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. 
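The decode stage of these scripts (as in the 5s run just above) loops over train_dev and eval2000 and runs each decode, plus the optional fsh_fg rescoring, inside a backgrounded subshell; the trailing wait holds the script until both finish, and the "|| exit 1" inside the parentheses only aborts that subshell, so a failed decode does not stop the top-level script. The same pattern reduced to a skeleton (illustration only):

for decode_set in train_dev eval2000; do
  (
    echo "decoding $decode_set ..."    # stand-in for steps/nnet3/decode.sh + rescoring
    sleep 1
  ) &
done
wait   # block until both background subshells have finished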
+ +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. 
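Several of the entries above mention re-using previously dumped egs (all the runs in this file pass --egs-dir exp/chain/tdnn_2y_sp/egs, and _4r reduced layer context specifically so the old egs still fit). A dumped egs directory records the context it was built with, so a quick compatibility check before re-using it might look like the sketch below (assumption: the usual info/ files written by the nnet3 chain egs-dumping script are present).

egs_dir=exp/chain/tdnn_2y_sp/egs   # the egs directory re-used by the runs in this file
for f in left_context right_context frames_per_eg; do
  [ -f $egs_dir/info/$f ] && echo "$f: $(cat $egs_dir/info/$f)"
done
# a model that needs more left/right context than the egs were dumped with cannot re-use them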
+ +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
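+
+# (Aside on the egs sizes mentioned in the _i note below: going from 75-frame
+# egs with 256 elements per minibatch to 150-frame egs with 128 elements keeps
+# the number of frames per minibatch unchanged, since 75 * 256 = 150 * 128 = 19200.)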
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5t # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
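+
+  # (For reference: with the --splice-indexes passed below, the network's total
+  # left context works out to 1+1+3+3+3+6 = 17 frames and its right context to
+  # 1+2+3+3+3+0 = 12 frames, since the per-layer offsets accumulate across layers.)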
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5u.sh b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh new file mode 100755 index 00000000000..34fe30993cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh @@ -0,0 +1,505 @@ +#!/bin/bash + +# _5u is as _5o but modifying the mfcc generation to use a narrower window while +# generating the lower-order mfcc coefficients (the first 10). + +# Abandoning it partway through after I got the following less-than-promising diagnostics. +# grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_valid.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146977 + -0.0159528 = -0.16293 per frame, over 20000 frames. +# exp/chain/tdnn_5u_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.147207 + -0.015692 = -0.162899 per frame, over 20000 frames. +# a03:s5c: grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_train.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146703 + -0.0165036 = -0.163207 per frame, over 20000 frames. 
+# exp/chain/tdnn_5u_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.145524 + -0.0162272 = -0.161751 per frame, over 20000 frames.
+
+# _5o is as _5n but adding an extra splicing layer and increasing the
+# splice-width slightly on the 1st layer, to get closer to the context in 5n;
+# having one more layer running at double-frequency, and reverting the frame-length to
+# the same as in the baseline (25ms) to avoid sacrificing frequency resolution.
+
+# Objective functions improve but WER change is quite small vs 5n (~0.1%), so
+# not clear that the extra time is worth it (it's noticeably slower to train as
+# that extra layer is at a higher sampling rate).
+#
+#System                     5j        5n        5o
+#WER on train_dev(tg)       17.59     16.85     16.83
+#WER on train_dev(fg)       16.33     15.67     15.60
+#WER on eval2000(tg)        19.1      19.1      18.8
+#WER on eval2000(fg)        17.5      17.3      17.2
+#Final train prob        -0.114691 -0.116341 -0.111613
+#Final valid prob        -0.130761 -0.130884 -0.126765
+
+# _5n is as _5j (also omitting the iVectors), but using double the input frame
+# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying
+# the splice indexes accordingly.
+
+# _5j is as _5e, but omitting the iVectors.
+
+# Definitely worse, although curiously, there is very little effect on the valid prob.
+#./compare_wer.sh 5e 5j
+#System                     5e        5j
+#WER on train_dev(tg)       15.43     17.59
+#WER on train_dev(fg)       14.32     16.33
+#WER on eval2000(tg)        17.3      19.1
+#WER on eval2000(fg)        15.5      17.5
+#Final train prob        -0.110056 -0.114691
+#Final valid prob        -0.129184 -0.130761
+
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                     5b        5e
+#WER on train_dev(tg)       15.51     15.43
+#WER on train_dev(fg)       14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob        -0.112013 -0.110056
+#Final valid prob        -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b
+#System                     5a        5b
+#WER on train_dev(tg)       15.86     15.51
+#WER on train_dev(fg)       14.74     14.39
+#WER on eval2000(tg)        17.4      17.3
+#WER on eval2000(fg)        15.6      15.6
+#Final train prob        -0.0998359 -0.112013
+#Final valid prob        -0.115884 -0.130879
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=13
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5u # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=2 # this is about the same amount of compute as the normal 4, since one
+             # epoch encompasses all frame-shifts of the data.
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=300 # doubling it, since we have half the frame rate.
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+# Generate double-frame-rate version of the data with normal window size.
+if [ $stage -le 12 ]; then
+  mfccdir=mfcc
+  for dataset in eval2000 train_dev ${train_set}; do
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \
+        data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir;
+    utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems
+  done
+fi
+
+# Generate double-frame-rate version of the data with smaller than normal window size;
+# and only keeping the first 10 MFCC coefficients.
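+# (Dimension check: the dbl3 features keep only the first 10 coefficients, and
+# stage 14 below selects dims 10-39 of the dbl2 hires features, i.e. 30 dims;
+# pasting the two gives 10 + 30 = 40-dimensional features, presumably matching
+# the usual 40-dimensional hires MFCC setup.)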
+if [ $stage -le 13 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_dbl3 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_dbl3.conf \ + data/${dataset}_dbl3 exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_dbl3 # remove segments with problems + done +fi + +# select dimension 10-39 of the dbl2 features, then create pasted features consisting +# of the 10 dimensions of the dbl3, plus the selected dimensions 10-39 of dbl2. +if [ $stage -le 14 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + steps/select_feats.sh --cmd "$train_cmd --max-jobs-run 4" 10-39 data/${dataset}_hires_dbl2 data/${dataset}_hires_dbl2_select \ + exp/make_dbl3/$dataset $mfccdir + rm data/${dataset}_hires_dbl2_select/cmvn.scp 2>/dev/null || true + steps/paste_feats.sh --cmd "$train_cmd --max-jobs-run 4" data/${dataset}_hires_dbl2_select data/${dataset}_dbl3 data/${dataset}_pasted \ + exp/make_dbl3/$dataset $mfccdir + steps/compute_cmvn_stats.sh data/${dataset}_pasted exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_pasted + done +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_pasted $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
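+  # (The --self-loop-scale 1.0 given to mkgraph.sh below corresponds to the
+  # change described in the _2c note above: training now uses transition and
+  # self-loop scales of 1, so the graph used in testing is built to match.)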
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 17 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          $graph_dir data/${decode_set}_pasted $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_pasted \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5v.sh b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh
new file mode 100755
index 00000000000..b33f013b894
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh
@@ -0,0 +1,459 @@
+#!/bin/bash
+
+# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
+
+# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
+# I ended up running it again after I suspected that we had 'got lucky' with
+# this particular run (since various experiments using 5v as a starting point
+# were failures); that rerun is the 5v2 run.
+#
+# local/chain/compare_wer.sh 5e 5s 5t 5v 5v2
+# System                     5e        5s        5t        5v        5v2
+# WER on train_dev(tg)       15.43     15.47     15.43     15.38     15.74
+# WER on train_dev(fg)       14.32     14.31     14.34     14.39     14.50
+# WER on eval2000(tg)        17.3      17.4      17.4      17.4      17.5
+# WER on eval2000(fg)        15.5      15.6      15.6      15.7      15.9
+# Final train prob        -0.110056 -0.110928 -0.110752 -0.11156  -0.112155
+# Final valid prob        -0.129184 -0.132139 -0.129123 -0.131797 -0.129516
+
+
+# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
+# up), from 5000 to 3500.
+
+# about 5s: comparing with 5e which is the most recent baseline we actually
+# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced from 7500 to 5000, and the new option
+# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
+# smaller jesus-hidden-dims.
+
+# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
+# value of 1700 (between 1500 and 1800), and also a fix for a bug in the self-repair
+# code which was doubling the thresholds so there was, in effect,
+# no upper threshold. I stopped the p,q,r runs after I found this, but in
+# configuring this run I'm bearing in mind the train and valid probs from the
+# p,q,r runs.
+
+# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
+
+# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
+# to compensate for the fact that more of the output dimensions are now being
+# usefully used.
+
+# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
+# ReLUs that are over or under-saturated.
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                     5b        5e
+#WER on train_dev(tg)       15.51     15.43
+#WER on train_dev(fg)       14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob        -0.112013 -0.110056
+#Final valid prob        -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5v # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
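+
+  # (As the _4r note above mentions, egs can only be re-used if the model's
+  # context does not exceed what the egs were dumped with; the --egs-dir option
+  # below points this run at the egs from the 2y experiment instead of dumping
+  # new ones.)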
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh new file mode 100755 index 00000000000..1a40acfa105 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. + +# Difference is tiny. +#local/chain/compare_wer.sh 5k 5w +#System 5k 5w +#WER on train_dev(tg) 16.46 16.56 +#WER on train_dev(fg) 15.17 15.30 +#WER on eval2000(tg) 18.1 18.1 +#WER on eval2000(fg) 16.5 16.4 +#Final train prob -0.105502 -0.106549 +#Final valid prob -0.12337 -0.120079 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. 
+# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
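A note to make the _2d -> _2i history above easier to follow: the phone-LM options changed name when the newer, more exact LM-estimation code landed, and only the new form appears in stage 12 of this script. Both option strings below are quoted verbatim from the notes and the script; nothing beyond what the _2d/_2i notes say is implied about their semantics.

# 2d-era options (pruned 4-gram via the older code):
#   --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
# 2i and later, as used in stage 12 below (by default a 4-gram with 3-gram as the no-prune order):
#   --lm-opts "--num-extra-lm-states=2000"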
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5w # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh new file mode 100755 index 00000000000..88dc28c2354 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5x is as _5w but decreasing the context of the averaging layer from +-0.99 +# seconds to +-0.66 seconds. I would not have expected this to work a priori, +# but the change from 5k -> 5l, which made the context wider, made WERs slightly +# worse, so I'd like to see what happens when we decrease the context. + +# It's worse. Odd because increasing the context (5k->5l) seemed to be a little +# worse also. +local/chain/compare_wer.sh 5w 5x +#System 5w 5x +#WER on train_dev(tg) 16.56 16.66 +#WER on train_dev(fg) 15.30 15.41 +#WER on eval2000(tg) 18.1 18.5 +#WER on eval2000(fg) 16.4 16.6 +#Final train prob -0.106549 -0.105693 +#Final valid prob -0.120079 -0.121834 + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. 
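To make the +-0.99 s vs +-0.66 s statement at the top of this file concrete: the model-configuration change is confined to the span of the mean+stddev() block inside --splice-indexes (5x also re-uses 5w's egs via --egs-dir in stage 12), and the outer numbers are frame offsets, so with the usual 10 ms frame shift they translate to seconds roughly as in this small sketch.

# 5w: ... -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0
# 5x: ... -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0
frames_to_seconds() { awk -v n="$1" 'BEGIN{printf("%.2f\n", n * 0.01)}'; }
frames_to_seconds 99   # -> 0.99, the +-0.99 s quoted for 5w
frames_to_seconds 63   # -> 0.63; the header above quotes this narrower span as roughly +-0.66 s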
+ +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
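Relating this back to the two 5k tables near the top of this file: the model is identical in both, so the gap between the better and worse 5k columns comes from decode-time settings alone (the worse one is explicitly the run with the default frames-per-chunk of 50 plus --extra-left-context 20), which is presumably why stage 14 below decodes with a much longer chunk. The relevant flags, quoted from that stage:

# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
#     --frames-per-chunk 300 \
#     --nj 50 --cmd "$decode_cmd" ...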
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5w_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh new file mode 100755 index 00000000000..54769c23734 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
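The _2m setting above survives in this script as a variable rather than a literal flag; the wiring, quoted from the configuration block and the stage-11 tree-building call further down, is simply:

# leftmost_questions_truncate=-1   # -1 disables the mechanism, per the _2m note
# ...
# steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
#     --leftmost-questions-truncate $leftmost_questions_truncate \
#     --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir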
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
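Picking up the _p note above: the real per-frame derivative weights are dumped with the egs by the egs-generation step, so nothing in this script computes them, but purely as an illustration of the shape the note describes (10 zero-weight frames at each edge, then a linear climb to 1.0 over the next 10 frames; the exact ramp used by the code may differ), one edge of a 150-frame eg would look something like:

for t in $(seq 0 24); do
  awk -v t="$t" 'BEGIN{ w = (t < 10) ? 0 : ((t < 20) ? (t - 9) / 10 : 1); printf("frame %2d  deriv-weight %.1f\n", t, w) }'
done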
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh new file mode 100755 index 00000000000..94843bfa2c9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _5z is as _5v, but adding skip-splicing (a new configuration option) +# It seems not helpful. I'll remove the option soon. +# note: 5v2 is a rerun of 5v. + +# local/chain/compare_wer.sh 5v 5v2 5z +# System 5v 5v2 5z +# WER on train_dev(tg) 15.38 15.74 15.60 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.6 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.113823 +# Final valid prob -0.131797 -0.129516 -0.131356 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
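+
+# [Editor's note: in stage 14 below, the rescoring call relies on bash brace
+# expansion, which can look cryptic at first sight.  For example, with
+# decode_set=train_dev:
+#
+#   echo data/lang_sw1_{tg,fsh_fg}
+#   # -> data/lang_sw1_tg data/lang_sw1_fsh_fg
+#   echo $dir/decode_train_dev_sw1_{tg,fsh_fg}
+#   # -> $dir/decode_train_dev_sw1_tg $dir/decode_train_dev_sw1_fsh_fg
+#
+# so steps/lmrescore_const_arpa.sh receives the old lang directory, the new
+# (const-arpa) lang directory, the data directory, the existing tg decode
+# directory and the new fsh_fg output directory, in that order.]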
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh new file mode 100755 index 00000000000..c618d1c0adf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -0,0 +1,490 @@ +#!/bin/bash + +# _6a is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# No better. +#local/chain/compare_wer.sh 5v 6a +#System 5v 6a +#WER on train_dev(tg) 15.38 15.49 +#WER on train_dev(fg) 14.39 14.30 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.109471 +#Final valid prob -0.131797 -0.129035 + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). 
+# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
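+
+# [Editor's sketch of the "about a million parameters" figure mentioned above
+# for the 5v -> 5y change, under the assumption (not verified against
+# make_jesus_configs.py) that the final layer is essentially an affine of size
+# final-hidden-dim x num-pdfs, with the tree built with a 9000-leaf target as
+# in stage 11 below:
+#
+#   num_pdfs=9000                           # assumption: roughly the 9000-leaf target
+#   echo $(( (500 - 400) * num_pdfs ))      # ~= 900000 params removed from the final layer
+#
+# which is roughly the million parameters being shifted into the hidden parts
+# of the network via the larger jesus-forward dims.]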
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
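+
+# [Editor's back-of-envelope for the _2y note above on --frames-per-iter, with
+# made-up round numbers -- none of these figures are measured, and the exact
+# formula lives in steps/nnet3/chain/train_tdnn.sh.  Very roughly, the number
+# of iterations scales like
+#   num_iters ~ (total_frames / frames_per_iter) * subsampling * num_epochs / avg_num_jobs
+# so raising frames-per-iter from 800k to 1.2M cuts the iteration count (and
+# with it the per-iteration model-averaging overhead) by about a third:
+#
+#   total_frames=300000000                  # assumption, order of magnitude only
+#   subsampling=3; num_epochs=4
+#   avg_jobs=$(( (3 + 16) / 2 ))            # integer average of num-jobs-initial/final
+#   echo $(( total_frames / 1200000 * subsampling * num_epochs / avg_jobs ))   # ~333
+#   echo $(( total_frames /  800000 * subsampling * num_epochs / avg_jobs ))   # ~500
+# ]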
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6b.sh b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh new file mode 100755 index 00000000000..5cd3f7dfbf2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh @@ -0,0 +1,480 @@ +#!/bin/bash + +# _6b is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6c.sh b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh new file mode 100755 index 00000000000..7334a5e185e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden +# layer inside jesus layer. + +# Note: 5v2 is a rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6c +#System 5v 5v2 6c +#WER on train_dev(tg) 15.38 15.74 15.54 +#WER on train_dev(fg) 14.39 14.50 14.55 +#WER on eval2000(tg) 17.4 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.114084 +#Final valid prob -0.131797 -0.129516 -0.129589 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --thick-jesus-layer true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6d.sh b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh new file mode 100755 index 00000000000..80b6a18cabf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + +# no clear difference. +#[note, 5v2 is a rerun of 5v]. +# local/chain/compare_wer.sh 5v 5v2 6d +# System 5v 5v2 6d +# WER on train_dev(tg) 15.38 15.74 15.66 +# WER on train_dev(fg) 14.39 14.50 14.54 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.8 +# Final train prob -0.11156 -0.112155 -0.112034 +# Final valid prob -0.131797 -0.129516 -0.131714 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 84 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6e.sh b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh new file mode 100755 index 00000000000..d44973db7ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh @@ -0,0 +1,464 @@ +#!/bin/bash + + +# _6e is as _6d but going further: reducing --num-jesus-blocks to 72 = ceil(500/7). + +# +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
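The block counts mentioned in the 6d/6e notes above come from simple rounding of jesus-forward-input-dim (500) divided by the number of blocks; a quick check of the arithmetic quoted there:

  awk 'BEGIN {
    printf("default (100 blocks): 500/100 = %.2f -> %d dims per block\n", 500/100, int(500/100 + 0.5));
    printf("6d      (84 blocks) : 500/84  = %.2f -> %d dims per block\n", 500/84,  int(500/84  + 0.5));
    printf("6e: 72 = ceil(500/7) -> %d\n", int(500/7) + (500 % 7 ? 1 : 0));
  }'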
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
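The bracketed numbers in the show_wer.sh lines quoted in these headers are simply (insertions + deletions + substitutions) over the number of reference words; for example, the 4f train_dev line quoted above checks out as:

  # 870 ins + 2354 del + 5058 sub = 8282 errors over 49204 reference words
  awk 'BEGIN { printf("%.2f%% WER\n", 100 * (870 + 2354 + 5058) / 49204) }'   # -> 16.83% WER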
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 72 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6f.sh b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh new file mode 100755 index 00000000000..fb7ff03b66d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +# note, 5v2 is a rerun of 5v. +# local/chain/compare_wer.sh 5v 5v2 6f +# System 5v 5v2 6f +# WER on train_dev(tg) 15.38 15.74 15.71 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.111305 +# Final valid prob -0.131797 -0.129516 -0.131487 + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
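+
+# (A note on the LM options mentioned above: the x / y / z / 2a runs used the
+# older option name --num-extra-states, while the newer LM-estimation code in
+# use from 2i onwards -- and in the train_tdnn.sh call further down this
+# script -- takes --num-extra-lm-states.  Redoing that sweep with the current
+# code would roughly amount to passing, for each run,
+#   --lm-opts "--num-extra-lm-states=$n"   # for n in 0 500 2000 8000
+# to steps/nnet3/chain/train_tdnn.sh; the values are just the ones tried above.)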
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6g.sh b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh new file mode 100755 index 00000000000..8d4e8b79fd0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh @@ -0,0 +1,491 @@ +#!/bin/bash + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# seems better than 6f, and about the same as (5v,5v2). encouraging. +# note, 5v2 is rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6f 6g +#System 5v 5v2 6f 6g +#WER on train_dev(tg) 15.38 15.74 15.71 15.50 +#WER on train_dev(fg) 14.39 14.50 14.50 14.31 +#WER on eval2000(tg) 17.4 17.5 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.111305 -0.105853 +#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997 + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
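+
+# (That is also the setting this script uses: the config section below sets
+# leftmost_questions_truncate=-1 and passes it straight through to the
+# tree-building stage, roughly
+#   steps/nnet3/chain/build_tree.sh --leftmost-questions-truncate -1 \
+#     --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+# so -1 simply means the truncation mechanism is disabled, as described above.)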
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0. +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
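+
+# (Rough arithmetic behind "broadly the same": the four k->m deltas quoted
+# above are +0.2 (20.7->20.9), -0.3 (18.9->18.6), -0.4 (19.3->18.9) and 0.0
+# (17.6->17.6), i.e. about -0.1% absolute on average, which is comparable to
+# rerun-to-rerun differences such as 5v vs 5v2 near the top of this file.)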
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
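+
+  # Note: the call below re-uses the egs dumped for the 2y run (via --egs-dir
+  # exp/chain/tdnn_2y_sp/egs) rather than dumping new ones; as noted in the 4r
+  # comment above, the splice-indexes were chosen so that the network's context
+  # still fits within those existing egs.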
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh new file mode 100755 index 00000000000..f3065cec603 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh @@ -0,0 +1,494 @@ +#!/bin/bash + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. +#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
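# Back-of-envelope on the _2y note above: raising --frames-per-iter by 1.5x covers the
# same number of epochs in about 1.5x fewer iterations, which is what amortises the
# fixed per-iteration costs (model averaging, job startup) when an iteration only takes
# a minute or so.  Pure arithmetic, nothing measured here:
awk 'BEGIN {
  old = 800000; new = 1200000;
  printf("frames-per-iter %d -> %d: iteration count (and per-iteration overhead) scales by %.2f\n",
         old, new, old / new); }'
# Each iteration then does ~1.5x more compute, but the fixed overhead per iteration is unchanged.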
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
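# The _p note above describes the per-frame derivative weights that get dumped with the
# egs: with the 30-frame overlap mentioned there, the first/last 10 frames of a chunk get
# zero weight and the next 10 ramp linearly up to 1.0.  A toy rendering of that shape for
# a 150-frame chunk (the ramp shape and frame counts are taken from the comment; the real
# weights come from the egs-dumping code, not from this snippet):
awk -v n=150 'BEGIN {
  for (t = 0; t < n; t++) {
    d = (t < n - 1 - t) ? t : n - 1 - t;              # distance from the nearer edge
    w = (d < 10) ? 0.0 : (d < 20) ? (d - 9) / 10.0 : 1.0;
    printf("frame %3d  deriv-weight %.1f\n", t, w);
  }
}' | head -25        # first 25 frames; the tail of the chunk mirrors the head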
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
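# A note on the --initial/final-effective-lrate options passed to train_tdnn.sh just
# below: as I understand the nnet3 options, the "effective" rate is the per-job learning
# rate divided by the number of parallel jobs, so each job actually sees
# effective_lrate * num_jobs, with num_jobs ramping from $num_jobs_initial to
# $num_jobs_final.  A rough sketch of that schedule; the iteration count and the exact
# interpolation used by the real script are assumptions here:
num_iters=300     # placeholder; the training script derives this from the data
awk -v i0=$initial_effective_lrate -v i1=$final_effective_lrate \
    -v j0=$num_jobs_initial -v j1=$num_jobs_final -v n=$num_iters 'BEGIN {
  for (it = 0; it <= n; it += n / 4) {
    eff  = i0 * exp(log(i1 / i0) * it / n);           # exponentially decaying effective rate
    jobs = int(j0 + (j1 - j0) * it / n + 0.5);        # linearly ramping job count
    printf("iter %3d: num-jobs=%2d  effective-lrate=%.6f  per-job lrate=%.6f\n",
           it, jobs, eff, eff * jobs);
  }
}'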
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh new file mode 100755 index 00000000000..b0f38b9fb0f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is a replica of_6h script, but makes use of the python trainer +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h_py # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/make_jesus_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --jesus-forward-input-dim 600 \ + --jesus-forward-output-dim 1700 \ + --jesus-hidden-dim 0 \ + --jesus-stddev-scale 0.2 \ + --final-layer-learning-rate-factor 0.25 \ + --self-repair-scale 0.00001 \ + --xent-separate-forward-affine=true \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6i.sh b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh new file mode 100755 index 00000000000..457b424be73 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh @@ -0,0 +1,497 @@ +#!/bin/bash + +# _6i takes aspects from 5n and 6g. 
Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. + +# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
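# On the ':' entries added in the _3f note above: as I read that splice-indexes string,
# offsets before a ':' are ordinary feed-forward splice offsets and offsets after it feed
# the layer its own earlier output (hence "more like an RNN").  A small parse of the
# notation, purely for illustration (the real parsing is done by the config-generation
# script):
spec="-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
layer=1
for s in $spec; do
  feedforward=${s%%:*}
  recurrent=${s#*:}
  [ "$recurrent" = "$s" ] && recurrent=none
  echo "layer $layer: feed-forward offsets [$feedforward]  recurrent offsets [$recurrent]"
  layer=$((layer + 1))
done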
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6j.sh b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh new file mode 100755 index 00000000000..ded13de9470 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
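# A quick arithmetic check on the _5n note above (doubling the input frame rate and
# scaling --frames-per-iter to keep the data per iteration comparable), measured in
# seconds of audio rather than frames; the numbers are the ones given in the note:
awk 'BEGIN {
  printf("1.6M frames x 5 ms = %.2f h of audio per iteration (matches the previous setup, per the note)\n",
         1600000 * 0.005 / 3600);
  printf("2.0M frames x 5 ms = %.2f h of audio per iteration (the value actually used)\n",
         2000000 * 0.005 / 3600);
}'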
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
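# The _4r note above shrinks the per-layer splice contexts so that egs dumped for the
# earlier setup can be re-used: egs carry a fixed amount of left/right context, so a new
# model can only reuse them if it does not need *more* total context.  The total context
# is just the sum over layers of the most negative / most positive splice offsets.  A
# small helper to check this for a --splice-indexes string (illustration only; the
# training scripts compute this themselves), run on the string this script later passes
# to train_tdnn.sh:
context() {
  echo "$1" | awk '{
    left = 0; right = 0;
    for (i = 1; i <= NF; i++) {
      n = split($i, a, ",");
      min = a[1] + 0; max = a[1] + 0;
      for (j = 2; j <= n; j++) { v = a[j] + 0; if (v < min) min = v; if (v > max) max = v; }
      if (min < 0) left -= min;
      if (max > 0) right += max;
    }
    printf("left-context=%d right-context=%d\n", left, right);
  }'
}
context "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4"   # -> left-context=19 right-context=17
# (Recurrent ':' entries, as in the _3f setups, would need extra handling.)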
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
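# A note on the two subsampling options on the train_tdnn.sh call just below (this is my
# reading of the options, not something stated in the scripts): --frame-subsampling-factor 4
# means the network emits one output every 4 input frames, and --alignment-subsampling-factor 4
# subsamples the tri4 lattices/alignments (which are at the standard 10 ms rate) by the same
# factor so they line up with the output rate.  In the 6i script above the inputs are at
# 7.5 ms, so a factor-4 output step is 30 ms = 3 x 10 ms, which is why 6i uses an alignment
# factor of 3 instead.  Quick arithmetic check:
awk 'BEGIN {
  printf("6j: 10.0 ms x 4 = %2.0f ms output shift -> alignment factor %g (alignments at 10 ms)\n",
         10.0 * 4, 10.0 * 4 / 10);
  printf("6i:  7.5 ms x 4 = %2.0f ms output shift -> alignment factor %g (alignments at 10 ms)\n",
          7.5 * 4,  7.5 * 4 / 10);
}'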
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6k.sh b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh new file mode 100755 index 00000000000..4625da200e6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6k is as _6i, but one more epoch. After running the first few stages, I'm +# copying the last model from 6i and starting from that point, to save compute. +# No better. +#local/chain/compare_wer.sh 6i 6k +#System 6i 6k +#WER on train_dev(tg) 15.62 15.67 +#WER on train_dev(fg) 14.46 14.47 +#WER on eval2000(tg) 17.3 17.4 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417-0.0994163 +#Final valid prob -0.123985 -0.122743 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. 
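+# (Concretely, in the splice indexes used below,
+# "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4", the offsets in the first
+# two layers step by 1 input frame, the third layer's offsets are all multiples
+# of 2 (presumably the first 2x subsampling), and the later layers' offsets are
+# multiples of 4 (the second 2x), so the output frame shift is 4 x 7.5 ms = 30 ms,
+# the same as the usual 3 x 10 ms.)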
+ +# local/chain/compare_wer.sh 6h 6i +# System 6h 6i +# WER on train_dev(tg) 15.46 15.62 +# WER on train_dev(fg) 14.28 14.46 +# WER on eval2000(tg) 17.4 17.3 +# WER on eval2000(fg) 15.7 15.8 +# Final train prob -0.105663 -0.10417 +# Final valid prob -0.130166 -0.123985 + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramp up to a weight of 1.0 over 10 frames. +
+# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. +
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. +
+# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. +
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true) +
+# _g is as _f but more splicing at last layer. +
+# _f is as _e but with 30 as the number of left phone classes instead +# of 10. +
+# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. +
+# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. +
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. +
+# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. +
+set -e +
+# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6k # Note: _sp will get added to this if $speed_perturb == true. +
+# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 200 frames at the 7.5ms frame shift is equivalent to 150 at the 10ms frame rate. +remove_egs=false +
+# End configuration section. +echo "$0 $@" # Print the command line for logging +
+. cmd.sh +. ./path.sh +. ./utils/parse_options.sh +
+if ! cuda-compiled; then + cat <$lang/topo +fi +
+if [ $stage -le 11 ]; then + # Build a tree using our new topology.
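+  # (Presumably the factor of 3 here, rather than the 4 used for the network,
+  # is because the alignments in $ali_dir are at the standard 10 ms frame rate,
+  # so subsampling them by 3 gives a 30 ms shift, matching the network output
+  # rate of 4 x 7.5 ms = 30 ms; the same reasoning gives
+  # --alignment-subsampling-factor 3 in the train_tdnn.sh call below.)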
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6i_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
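+  # (0.0075 s = 7.5 ms, the frame shift of the _hiresf features generated in
+  # stage 12.)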
+fi +
+if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi +
+decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6l.sh b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh new file mode 100755 index 00000000000..f1e0821f2cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh @@ -0,0 +1,521 @@ +#!/bin/bash +
+# _6l is as _6i, but adding the option --xent-separate-forward-affine=true which +# I had accidentally omitted, and adding 4 frames more left context and 2 frames +# more right context. +
+# Below I'm also comparing with 6h, which (since we now added +# --xent-separate-forward-affine=true) is the appropriate normal-frame-rate +# baseline, rather than 6g. +
+# This experiment is better than 6i, but there is no clear difference with +# 6h. So we can't really say that we're getting any benefit from the higher +# frame rate. +
+#local/chain/compare_wer.sh 6h 6i 6l +#System 6h 6i 6l +#WER on train_dev(tg) 15.46 15.62 15.42 +#WER on train_dev(fg) 14.28 14.46 14.25 +#WER on eval2000(tg) 17.4 17.3 17.3 +#WER on eval2000(fg) 15.7 15.8 15.8 +#Final train prob -0.105663 -0.10417 -0.0984719 +#Final valid prob -0.130166 -0.123985 -0.119088 +#Final train prob (xent) -1.42483 -1.60566 -1.46581 +#Final valid prob (xent) -1.49792 -1.67945 -1.51644 +
+
+# _6i takes aspects from 5n and 6g. Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# The idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +
+# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 +
+# _5n is as _5j (also omitting the iVectors), but doubling the input frame +# rate, going from a 10 ms to a 5 ms frame shift (and reducing the frame width +# from 25 to 20 ms), and modifying the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training time per +# iteration is getting shorter than I like (-> wasting time waiting for the queue). +
+# A very nice improvement on dev; small improvement on eval2000 though.
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramp up to a weight of 1.0 over 10 frames. +
+# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. +
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. +
+# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. +
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true) +
+# _g is as _f but more splicing at last layer. +
+# _f is as _e but with 30 as the number of left phone classes instead +# of 10. +
+# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. +
+# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. +
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. +
+# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. +
+set -e +
+# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6l # Note: _sp will get added to this if $speed_perturb == true. +
+# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since an + # epoch encompasses all frame-shifts of the data and we now have 4 + # frame-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 200 frames at the 7.5ms frame shift is equivalent to 150 at the 10ms frame rate. +remove_egs=false +
+# End configuration section. +echo "$0 $@" # Print the command line for logging +
+. cmd.sh +. ./path.sh +. ./utils/parse_options.sh +
+if ! cuda-compiled; then + cat <$lang/topo +fi +
+if [ $stage -le 11 ]; then + # Build a tree using our new topology.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{05,b11,b12,b13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2,4 -4,0,4 -4,0,4 -8,-4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
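+  # (For reference: the extra context mentioned at the top shows up in the
+  # splice indexes above, relative to 6i/6k: the third layer uses "-4,-2,0,2,4"
+  # instead of "-4,-2,0,2" (2 more frames of right context) and the last layer
+  # uses "-8,-4,0,4" instead of "-4,0,4" (4 more frames of left context).)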
+fi +
+if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi +
+decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6m.sh b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh new file mode 100755 index 00000000000..8a7b14ef342 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh @@ -0,0 +1,497 @@ +#!/bin/bash +
+# _6m is as _6j (which subsamples by a factor of 4, not 3, at the output), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). +
+# This is unhelpful and, if anything, a little worse. +#local/chain/compare_wer.sh 6j 6m +#System 6j 6m +#WER on train_dev(tg) 15.86 16.08 +#WER on train_dev(fg) 14.79 14.85 +#WER on eval2000(tg) 17.6 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.131444 -0.131515 +#Final valid prob -0.167574 -0.17046 +#Final train prob (xent) -1.45908 -1.43814 +#Final valid prob (xent) -1.55937 -1.5412 +
+# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# Also reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. +
+# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +
+# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# The idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had to choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at +
+# _5n is as _5j (also omitting the iVectors), but doubling the input frame +# rate, going from a 10 ms to a 5 ms frame shift (and reducing the frame width +# from 25 to 20 ms), and modifying the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training time per +# iteration is getting shorter than I like (-> wasting time waiting for the queue). +
+# A very nice improvement on dev; small improvement on eval2000 though.
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
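+
+# (Illustrative sketch, for orientation only: the _2d-style LM options quoted
+# above would have been passed to the training script along the lines of
+#   --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# whereas the newer LM-estimation code used from _2i onwards, and in stage 12 of
+# this script, takes just
+#   --lm-opts "--num-extra-lm-states=2000")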
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # an epoch encompasses all frame-shifts of the data and we now have 4 + # frame-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6n.sh b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh new file mode 100755 index 00000000000..625cb73cf50 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _6n is as _6m, but with a less-wide splicing context. + +# The effect is inconsistent- there is none, on average. 
+#System 6j 6m 6n +#WER on train_dev(tg) 15.86 16.08 16.01 +#WER on train_dev(fg) 14.79 14.85 14.66 +#WER on eval2000(tg) 17.6 17.6 17.7 +#WER on eval2000(fg) 15.8 15.8 15.9 +#Final train prob -0.131444 -0.131515 -0.133681 +#Final valid prob -0.167574 -0.17046 -0.172072 +#Final train prob (xent) -1.45908 -1.43814 -1.53108 +#Final valid prob (xent) -1.55937 -1.5412 -1.65137 + +# _6m is as _6j (which subsamples by 4 frames), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had to choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per iteration is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+ +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
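+
+# (Illustrative aside on the _3f notation above: the recurrence is the trailing
+# ":-3" on a layer's splice indexes, e.g.
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"   # _3f, recurrent
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"            # _3d, feed-forward
+# i.e. the layers with a ":-3" suffix get an additional, recurrent connection at
+# an offset of -3 frames, which is the "(left) recurrence" described above.)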
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
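+
+# (Illustrative sketch of the _p edge-weighting described a couple of paragraphs
+# above, assuming a linear ramp: with 10 zero-weight frames and a 10-frame ramp
+# at each edge of an eg, the per-frame derivative weights would look roughly like
+#   frame:   1 ... 10 | 11   12   ...  20 | 21 ...
+#   weight:  0.0      | 0.1  0.2  ...  1.0 | 1.0
+# i.e. zero derivative right at the edges, ramping up to the full weight of 1.0.)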
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
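+
+  # (Descriptive note: the --egs-dir option below re-uses the egs already dumped
+  # by the 6m run; 6n only narrows the splice indexes relative to 6m, so those
+  # egs are assumed to still provide enough context.)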
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6m_sp/egs \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -2,0,2 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6o.sh b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh new file mode 100755 index 00000000000..e07e6092644 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6o is as _6h but halving the --l2-regularize option, because since the +# time we last tuned this, other regularization methods have been added. + +#It's worse. +#local/chain/compare_wer.sh 6h 6o +#System 6h 6o +#WER on train_dev(tg) 15.46 15.61 +#WER on train_dev(fg) 14.28 14.58 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.105663-0.0992526 +#Final valid prob -0.130166 -0.127421 +#Final train prob (xent) -1.42483 -1.4369 +#Final valid prob (xent) -1.49792 -1.49867 + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. 
+#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-dim from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code for a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
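+
+# (Connecting note: the _2m change above corresponds to the
+# leftmost_questions_truncate=-1 setting in the configuration section of this
+# script; it is passed to the tree-building stage as
+#   --leftmost-questions-truncate $leftmost_questions_truncate
+# in stage 11 below, so the truncation mechanism stays disabled here.)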
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
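+
+# (Connecting note on the _2c change above: using transition and self-loop scales
+# of 1 instead of 0 is why stage 13 below builds the decoding graph with
+#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+# i.e. the test-time options are kept consistent with the scales used in training.)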
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
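+
+  # (Descriptive note: the --egs-dir option below re-uses the egs dumped for the
+  # 2y run (exp/chain/tdnn_2y_sp/egs), as several earlier experiments in this
+  # series did; the splicing context is presumably kept small enough for those
+  # egs to remain usable.)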
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.000025 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6p.sh b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh new file mode 100755 index 00000000000..a9f7eef9bbc --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh @@ -0,0 +1,503 @@ +#!/bin/bash + +# _6p is as _6j, but increasing the various regularization coefficients. +# the intention is to increase them by 4/3, since they are all evaluated +# once per output frame, and there are now fewer output frames by a factor +# of 3/4. To make them rounder numbers, I increased some by a factor +# of 5/4 (--xent-regularize, 0.1 -> 0.125, and --leaky-hmm-coefficient, +# 0.1 -> 0.125), and l2-regularize by 3/2 (0.00005 -> 0.000075). + +# Worse. +#local/chain/compare_wer.sh 6j 6p +#System 6j 6p +#WER on train_dev(tg) 15.86 15.91 +#WER on train_dev(fg) 14.79 14.76 +#WER on eval2000(tg) 17.6 17.9 +#WER on eval2000(fg) 15.8 15.9 +#Final train prob -0.131444 -0.143285 +#Final valid prob -0.167574 -0.173759 +#Final train prob (xent) -1.45908 -1.44287 +#Final valid prob (xent) -1.55937 -1.52918 + + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. 
+# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramp up to a weight of 1.0 over the next 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
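To make the '_p' note above concrete, the snippet below is a purely illustrative sketch, not part of any recipe in this patch: it prints one plausible reading of that edge weighting for an N-frame chunk, assuming the derivative weight is 0 for the 10 frames nearest each edge, ramps linearly to 1.0 over the next 10 frames, and is 1.0 in the interior. The value of N and the exact ramp are assumptions for illustration; the real weights are produced inside the egs-dumping code.

N=150   # e.g. the frames_per_eg used in these runs (assumed here for illustration)
awk -v n="$N" 'BEGIN {
  for (t = 0; t < n; t++) {
    d = (t < n - 1 - t) ? t : n - 1 - t;   # distance to the nearer edge of the chunk
    if (d < 10)      w = 0.0;              # zero derivative right at the edges
    else if (d < 20) w = (d - 9) / 10.0;   # linear ramp 0.1 .. 1.0
    else             w = 1.0;              # full weight in the interior
    printf("%d %.1f\n", t, w);             # frame index and its derivative weight
  }
}'

The intent, as described above, appears to be that the 30-frame overlap lets neighboring chunks cover each other's down-weighted edges, rather than simply cutting derivatives off with --min-deriv-time and --max-deriv-time.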
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6j_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.125 \ + --leaky-hmm-coefficient 0.125 \ + --l2-regularize 0.000075 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6q.sh b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh new file mode 100755 index 00000000000..440da3a1d6b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh @@ -0,0 +1,493 @@ +#!/bin/bash + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. +# (note, I forgot the self-repair-scale, and I probably should have used +# 6h as the baseline because it has --xent-separate-forward-affine=true; +# note, this experiment doesn't have --xent-separate-forward-affine=true but +# it would have been better to have it (retrying as 6r) + +# we're about 0.2% better than 6g. 
+#local/chain/compare_wer.sh 6g 6q +#System 6g 6q +#WER on train_dev(tg) 15.50 15.25 +#WER on train_dev(fg) 14.31 14.24 +#WER on eval2000(tg) 17.5 17.2 +#WER on eval2000(fg) 15.8 15.6 +#Final train prob -0.105853 -0.106936 +#Final valid prob -0.129997 -0.123066 +#Final train prob (xent) -1.4718 -1.66328 +#Final valid prob (xent) -1.55129 -1.71979 + + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=13 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6r.sh b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh new file mode 100755 index 00000000000..ffbac19d1eb --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +# _6r is as _6q, but adding --self-repair-scale 0.00001 +# --xent-separate-forward-affine=true. the appropriate normal-frame-rate +# baseline for this is 6h (since it has --xent-separate-forward-affine=true), +# so using that as the baseline: + +#local/chain/compare_wer.sh 6h 6r +#System 6h 6r +#WER on train_dev(tg) 15.46 15.06 +#WER on train_dev(fg) 14.28 14.05 +#WER on eval2000(tg) 17.4 17.2 +#WER on eval2000(fg) 15.7 15.4 +#Final train prob -0.105663 -0.106685 +#Final valid prob -0.130166 -0.122293 +#Final train prob (xent) -1.42483 -1.62108 +#Final valid prob (xent) -1.49792 -1.67695 + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6q_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6s.sh b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh new file mode 100755 index 00000000000..4693dde0a31 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh @@ -0,0 +1,502 @@ +#!/bin/bash + + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only difference is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. 
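To make the "all multiplied by 2" transformation concrete, here is a small sketch (not part of the patch); the 6h splice string is an assumption, inferred by halving the 6s values passed to train_tdnn.sh further down:

# Hypothetical illustration: derive the 6s splice indexes from the (assumed) 6h ones.
splice_6h="-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0"   # assumption, not taken from this patch
splice_6s=$(printf '%s\n' "$splice_6h" | perl -pe 's/(-?\d+)/$1*2/ge')
printf '%s\n' "$splice_6s"   # -> -2,0,2 -2,0,2,4 -6,0,6 -6,0,6 -6,0,6 -12,-6,0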
+ +# So we lose the improvement that we got in 6r (see below). This is consistent +# with the idea that we really do need the higher-frame-rate input, but it's +# also possible that some slight differences in the splicing indexes were +# responsible, so in 6t we'll do an experiment where we try to get closer +# to the splicing setup of 6r. +# +# local/chain/compare_wer.sh 6h 6r 6s +#System 6h 6r 6s +#WER on train_dev(tg) 15.46 15.06 15.50 +#WER on train_dev(fg) 14.28 14.05 14.45 +#WER on eval2000(tg) 17.4 17.2 17.5 +#WER on eval2000(fg) 15.7 15.4 15.7 +#Final train prob -0.105663 -0.106685 -0.105965 +#Final valid prob -0.130166 -0.122293 -0.122376 +#Final train prob (xent) -1.42483 -1.62108 -1.5454 +#Final valid prob (xent) -1.49792 -1.67695 -1.58129 + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
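The _2c note above mentions that using transition/self-loop scales of 1.0 (and building the test graph with --self-loop-scale 1.0, as in stage 15 below) is what makes it convenient to push weights and check that graphs are stochastic. A hedged sketch of such a check with Kaldi's fstisstochastic tool, which prints the min/max deviation of the arc sums from one (values near zero mean the FST is close to stochastic); the graph path is just the one this script would create:

# Sketch only: rough stochasticity check on a compiled decoding graph.
. ./path.sh
fstisstochastic exp/chain/tdnn_6t_sp/graph_sw1_tg/HCLG.fst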
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-2,0,2 -2,0,2,4 -6,0,6 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6t.sh b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh new file mode 100755 index 00000000000..47921335155 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# since _6s didn't work that well, in 6t we try something else: +# modifying 6s to use almost exactly the same splicing indexes as 6r, +# but with the first splice indexes changed from -1,0,1 to -1,1, so that +# all the differences are multiples of 2 (so the effective frame rate is +# the normal frame rate). In effect we're using a narrower splicing +# at the start of the nnet, than 6s. + +# 6t does seem better than 6s, but not quite as good as 6r. 
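To see why "all the differences are multiples of 2" brings the effective frame rate back to normal, here is a small sketch (not in the patch) that enumerates which input-frame offsets a single output frame can reach through the 6t splice indexes used below, assuming the usual additive composition of per-layer splice offsets; every reachable offset comes out odd, so only every other 5 ms frame is ever consulted:

# Sketch: reachable input-frame offsets for the 6t splicing
# "-1,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" (values from the command below).
offsets="0"
for layer in "-1 1" "-2 0 2" "-4 -2 0 2" "-6 0 6" "-6 0 6" "-12 -6 0"; do
  new=""
  for o in $offsets; do
    for d in $layer; do new="$new $((o + d))"; done
  done
  offsets=$(printf '%s\n' $new | sort -n | uniq | tr '\n' ' ')
done
printf 'reachable offsets: %s\n' "$offsets"
printf 'parities seen: %s\n' "$(for o in $offsets; do echo $(( ((o % 2) + 2) % 2 )); done | sort -u | tr '\n' ' ')"   # -> 1 (all odd)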
+# the fact that it's not as good as 6r may show that the double-frame-rate +# input was actually giving us some useful information-- although the +# improvement is only something like 0.1%-0.2%, and we didn't actually see +# any difference in the objective function from 6r, which undermines the +# notion that by removing that central 0 splice at the input, we lost +# some information. +# +# +#local/chain/compare_wer.sh 6r 6s 6t +#System 6r 6s 6t +#WER on train_dev(tg) 15.06 15.50 15.34 +#WER on train_dev(fg) 14.05 14.45 14.23 +#WER on eval2000(tg) 17.2 17.5 17.2 +#WER on eval2000(fg) 15.4 15.7 15.6 +#Final train prob -0.106685 -0.105965 -0.106575 +#Final valid prob -0.122293 -0.122376 -0.121902 +#Final train prob (xent) -1.62108 -1.5454 -1.62226 +#Final valid prob (xent) -1.67695 -1.58129 -1.67252 + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only differences is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. 
Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
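As an aside on reading the eval2000 lines above: the pipe-separated fields follow the standard sclite summary order, #Snt #Wrd | Corr Sub Del Ins Err S.Err, so Sub + Del + Ins should reproduce the overall Err up to rounding. A quick check against the 3g eval2000(tg) line:

# Sketch: sanity-check the sclite column order on the 3g eval2000(tg) line above.
awk 'BEGIN { printf("Sub+Del+Ins = %.1f (reported Err: 18.7)\n", 11.1 + 5.3 + 2.2) }'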
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6s_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6u.sh b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh new file mode 100755 index 00000000000..4c48a75ffd6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh @@ -0,0 +1,524 @@ +#!/bin/bash + +# _6u is as _6h, but with slightly different splicing indexes (start +# narrower than 6h and ramp up slowly). These are designed to be +# equivalent to those in 6t, except for use with normal-frame-rate, +# not double-frame-rate, input. The difference between 6t and 6u +# will show us whether having double-frame-rate input for the purpose +# of getting more different shifted versions of the input, is helpful. 
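The "more different shifted versions" argument above, and the factor-of-two bookkeeping in the double-frame-rate scripts (ivector_period 10 -> 20, frames_per_eg 150 -> 300, frame_shift written as 0.005), reduce to the same arithmetic. A small sketch, assuming (as in the standard chain egs generation) that the data is shifted by 0 .. frame-subsampling-factor - 1 input frames:

# Sketch: distinct data shifts available to each kind of system (assumed shift scheme).
# Double-frame-rate systems (6r/6s/6t): 5 ms frames, frame-subsampling-factor 6.
for s in 0 1 2 3 4 5; do printf '%d ' $((s * 5)); done; echo "ms shifts (5 ms input, factor 6)"
# Normal-frame-rate systems (6h/6u): 10 ms frames, frame-subsampling-factor 3.
for s in 0 1 2; do printf '%d ' $((s * 10)); done; echo "ms shifts (10 ms input, factor 3)"
# The 5 ms input therefore sees twice as many distinct shifts, including half-frame ones.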
+# [however, note that the number of frames-per-iter is not comparable +# between 6t and 6u: here we're using 1.2 million frames per eg, +# and 6s is using 3 million which at the normal frame rate would be +# 1.5 million, and 1.2 != 1.5. + +# 6u is no better than 6h, and maybe slightly worse. Certainly it's worse than +# 6t. In addition, the train-valid difference is bigger with 6h and 6u than +# with 6t. This is all consistent with the notion that the higher-frame-rate +# input, with with we can generate more shifted versions, does really make a +# difference. However, I want to wait till the 6v->6w comparison is ready, +# which may let us know whether the difference in frames-per-iter could have +# been a confounding factor here. (It's unlikely, but possible). +# +#local/chain/compare_wer.sh 6h 6t 6u +#System 6h 6t 6u +#WER on train_dev(tg) 15.46 15.34 15.46 +#WER on train_dev(fg) 14.28 14.23 14.28 +#WER on eval2000(tg) 17.4 17.2 17.6 +#WER on eval2000(fg) 15.7 15.6 15.9 +#Final train prob -0.105663 -0.106575 -0.108665 +#Final valid prob -0.130166 -0.121902 -0.129495 +#Final train prob (xent) -1.42483 -1.62226 -1.54189 +#Final valid prob (xent) -1.49792 -1.67252 -1.60749 + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. +#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. 
I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average, +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which, it turns +# out, I never tested for WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0. +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
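As a rough illustration of the per-job overhead point made in the 2y notes above (each iteration launches jobs, averages models and so on, so fewer and larger iterations waste less time), the sketch below shows how --frames-per-iter relates to the number of training iterations. The total-frame count and average job count are hypothetical placeholders, not values from this recipe, and the frame-shifting of chain egs is ignored.

    # back-of-envelope only; all inputs except frames_per_iter are made up.
    total_frames=500000000     # hypothetical total training frames after speed perturbation
    frames_per_iter=1200000    # what this script passes to --frames-per-iter (2y raised it from 800k)
    num_epochs=4
    avg_num_jobs=10            # roughly (num_jobs_initial + num_jobs_final) / 2
    num_iters=$(( num_epochs * total_frames / (frames_per_iter * avg_num_jobs) ))
    echo "roughly $num_iters iterations, each paying the model-averaging overhead"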
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but a different application of max-param-change (use --scale-max-param-change true). + +# _g is as _f but more splicing at the last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using a 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6v.sh b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh new file mode 100755 index 00000000000..158405a4058 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# _6v is as _6h, but moving to a TDNN+ReLU recipe instead of using jesus-layer. +# Otherwise we make everything as similar as possible to 6h. +# The ReLU dimension, at 576, is chosen to make the number of parameters about +# the same as 6h. + +# great improvement! +# local/chain/compare_wer.sh 6h 6v +# System 6h 6v +# WER on train_dev(tg) 15.46 15.00 +# WER on train_dev(fg) 14.28 13.91 +# WER on eval2000(tg) 17.4 17.2 +# WER on eval2000(fg) 15.7 15.7 + +# the following objf values are computed on the last iter (323), because due to +# a script bug, now fixed, the 'final' ones were not computed in 6v. +# note: in this run the xent learning rate was too slow. 
+# 323 train prob -0.129285 -0.120026 +# 323 valid prob -0.151648 -0.140628 +# 323 train prob (xent) -1.4443 -1.5431 +# 323 valid prob (xent) -1.51731 -1.56975 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6v # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
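The option-assembly lines in the config stage above (pool_opts, repair_opts) and the iter_opts/decode-directory naming in the decode stage below all rely on bash's ${var:+...} expansion, which emits the alternate text only when the variable is set and non-empty. A standalone illustration of the idiom, separate from this script:

    # illustration only: ${var:+text} expands to nothing when var is empty or unset.
    decode_iter=
    echo "decode_eval2000${decode_iter:+_$decode_iter}_sw1_tg"   # -> decode_eval2000_sw1_tg
    decode_iter=220                                              # hypothetical iteration number
    echo "decode_eval2000${decode_iter:+_$decode_iter}_sw1_tg"   # -> decode_eval2000_220_sw1_tg
    pool_type='none'; pool_window=
    pool_opts=${pool_type:+" --pool-type $pool_type "}${pool_window:+" --pool-window $pool_window "}
    echo "pool_opts:[$pool_opts]"                                # only the --pool-type part appears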
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6w.sh b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh new file mode 100755 index 00000000000..3e3bb622290 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# I discovered after running this that there was a problem with the egs-dumping, +# which seems to have existed for quite a while: the --right-tolerance defaults to 10 +# in the script, but it should have been 5, to match the code. However, 6v was +# run with older egs (before this bug was introduced) from 2y, so it doesn't +# have the problem. + +# note regarding the changes in objfs: these have explanations, they are due to +# the --right-tolerance increasing from 5->10 in 6v->6w: the chain objfs improve +# because of the less-restrictive numerator graphs, and the xent objfs get worse +# because the phone alignments become less consistent; we can see the reverse +# pattern in 6y -> 6z when we revert the right-tolerance back to 5. 
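The tolerance discussed above is an option of the egs-dumping stage rather than of the training script itself; anything placed in --egs.opts is passed through to it (that is how --frames-overlap-per-eg 0 gets there). A hedged sketch of pinning the value explicitly instead of relying on the script default that changed here, assuming the underlying get_egs script accepts --right-tolerance as the discussion implies:

    # sketch only, not part of this recipe: make the tolerance explicit so a
    # change in the script's default cannot silently alter freshly dumped egs.
    right_tolerance=5
    egs_opts="--frames-overlap-per-eg 0 --right-tolerance $right_tolerance"
    # ...and then pass   --egs.opts "$egs_opts"   on the train.py command line.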
+# +#local/chain/compare_wer.sh 6v 6w +#System 6v 6w +#WER on train_dev(tg) 15.00 15.33 +#WER on train_dev(fg) 13.91 14.27 +#WER on eval2000(tg) 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 +#Final train prob -0.105012 -0.10287 +#Final valid prob -0.125877 -0.120451 +#Final train prob (xent) -1.54736 -1.63586 +#Final valid prob (xent) -1.57475 -1.67173 + + + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6w # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
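The create_split_dir.pl call a few lines above uses plain bash brace expansion to name one physical location per disk; the script then spreads the dumped egs across them, leaving $dir/egs/storage as the single entry point, and the .nodelete marker, per its comment, keeps the egs around if that run dies. To preview which directories the brace pattern actually denotes before committing data to them:

    # illustration only: list the four storage directories the pattern expands to.
    dir=exp/chain/tdnn_6w_sp        # what $dir resolves to here once _sp is appended
    echo /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage \
      | tr ' ' '\n'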
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6x.sh b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh new file mode 100755 index 00000000000..177ddd2a867 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# 6x is as 6w, but changing the splice-indexes to be like in 6u +# except since this is a TDNN setup, we need a final "0" [the jesus-layer +# setup had a final ReLU as a special case.]. +# These splice indexes start smaller, and ramp up more slowly, than +# the baseline in 6w. +# We're reusing the 6x egs. + +# no clear benefit; if anything, it's slightly worse. +# local/chain/compare_wer.sh 6w 6x +# System 6w 6x +# WER on train_dev(tg) 15.33 15.30 +# WER on train_dev(fg) 14.27 14.35 +# WER on eval2000(tg) 17.3 17.4 +# WER on eval2000(fg) 15.6 15.7 +# Final train prob -0.10287 -0.103078 +# Final valid prob -0.120451 -0.122477 +# Final train prob (xent) -1.63586 -1.73292 +# Final valid prob (xent) -1.67173 -1.75042 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). 
+ +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6x # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
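The train.py call just below reuses the examples already dumped by the 6w run via --egs.dir exp/chain/tdnn_6w_sp/egs, so the egs-dumping options are effectively bypassed as long as that directory exists. These scripts also declare a common_egs_dir variable that otherwise goes unused; a hedged sketch of how it could be wired up for the same purpose (this is an assumption about intent, not what the script does):

    # sketch only; this script hard-codes the path instead.
    common_egs_dir=exp/chain/tdnn_6w_sp/egs          # leave empty to dump a fresh set
    egs_dir_opt=${common_egs_dir:+--egs.dir $common_egs_dir}
    # ...then put $egs_dir_opt (unquoted, so an empty value vanishes) on the
    # steps/nnet3/chain/train.py command line in place of the hard-coded option.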
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6y.sh b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh new file mode 100755 index 00000000000..a15c6648641 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# WER results are inconclusive, but objective values are encouraging. +# We'll keep the change as it makes sense. +# local/chain/compare_wer.sh 6w 6y +# System 6w 6y +# WER on train_dev(tg) 15.33 15.36 +# WER on train_dev(fg) 14.27 14.19 +# WER on eval2000(tg) 17.3 17.2 +# WER on eval2000(fg) 15.6 15.8 +# Final train prob -0.10287 -0.102139 +# Final valid prob -0.120451 -0.119654 +# Final train prob (xent) -1.63586 -1.55598 +# Final valid prob (xent) -1.67173 -1.58821 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6y # Note: _sp will get added to this if $speed_perturb == true. 
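The note on the dir= line above refers to the speed-perturbation convention these scripts share: when $speed_perturb is true, the training data and everything derived from it carry an _sp suffix (compare the exp/tri4_lats_nodup$suffix lattice directory used later). The suffix handling itself lives in a portion of the script not shown in this excerpt; a minimal sketch of the idea, with the data-directory name assumed rather than copied:

    # minimal sketch (train_set base name assumed): append _sp when perturbation is on.
    speed_perturb=true
    suffix=
    $speed_perturb && suffix=_sp        # "true"/"false" run as commands, a common Kaldi idiom
    dir=${dir}$suffix
    train_set=train_nodup$suffix        # assumed base name; adjust to the actual recipe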
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh new file mode 100755 index 00000000000..97cc1b83624 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? +# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). 
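The 'Final train prob' and 'Final valid prob' rows quoted throughout these headers come from the diagnostic jobs that training runs on held-out subsets of training and validation examples; the gap between the two is what the notes use as an overtraining signal (for example the train-valid remarks in the 6u/6t comparison). A hedged sketch of pulling those numbers from a finished run; the log names and message wording are assumptions and may differ between script versions:

    # hedged sketch: log locations and grep pattern assumed, adjust as needed.
    d=exp/chain/tdnn_6z_sp
    grep -h "Overall log-prob" \
      $d/log/compute_prob_train.final.log \
      $d/log/compute_prob_valid.final.log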
+ +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7a.sh b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh new file mode 100755 index 00000000000..95c3c9f4c24 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# 7a inherits from 6z (which is a TDNN+ReLU-based network with various small +# bugs hopefully fixed now), and from 6r, which is our most-successful +# double-frame-rate system. We're re-dumping the egs, because the egs used in +# 6r used right-tolerance=10, which turns out to have been a bug, and not a +# helpful one. + +# it is not better than 6z. +# local/chain/compare_wer.sh 6v 6z 7a +#System 6v 6z 7a +#WER on train_dev(tg) 15.00 15.18 15.05 +#WER on train_dev(fg) 13.91 14.06 14.10 +#WER on eval2000(tg) 17.2 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 15.7 +#Final train prob -0.105012 -0.106268 -0.110288 +#Final valid prob -0.125877 -0.126726 -0.127071 +#Final train prob (xent) -1.54736 -1.4556 -1.59569 +#Final valid prob (xent) -1.57475 -1.50136 -1.62312 + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. 
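7a is a double-frame-rate system: the _hires_dbl features later in this script use a 5 ms frame shift (the 0.005 written to $dir/frame_shift) instead of the usual 10 ms, so several quantities are simply doubled relative to the single-rate scripts: ivector_period 20 instead of 10, --egs.chunk-width 300 instead of 150, --trainer.frames-per-iter 3000000 instead of 1500000, frame-subsampling-factor 6 against alignment-subsampling-factor 3, and 2 epochs instead of 4 since each epoch now sees twice as many distinct frame shifts. The arithmetic below just checks that the doubled numbers describe the same amount of audio:

    # illustrative arithmetic only.
    frame_shift_ms=5                                          # vs. 10 ms in the single-rate recipes
    echo "audio per chunk:   $(( 300 * frame_shift_ms )) ms"      # 300 x 5 ms == 150 x 10 ms
    echo "ivector spacing:   $(( 20 * frame_shift_ms )) ms"       # period 20 x 5 ms == 10 x 10 ms
    echo "output frame rate: ~$(( 1000 / (frame_shift_ms * 6) )) Hz"  # subsampling 6 at 5 ms == 3 at 10 ms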
+ +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7a # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=2 # use 2 not 4 epochs, as with the double-frame-rate input, we + # shift the input data in double the number of distinct ways + # on each epoch. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires_dbl \ + --ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{7,11,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.frame-subsampling-factor 6 \ + --chain.alignment-subsampling-factor 3 \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 300 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 3000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_dbl \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 17 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh new file mode 100755 index 00000000000..8bde54f7eee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 7b is as 6z, but increasing the relu-dim slightly from 576 to 625. + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? +# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir exp/chain/tdnn_6z_sp/egs \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/show_wer.sh b/egs/swbd/s5c/local/chain/show_wer.sh new file mode 100755 index 00000000000..a82c4acf26d --- /dev/null +++ b/egs/swbd/s5c/local/chain/show_wer.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh +done diff --git a/egs/swbd/s5c/local/nnet2/run_nnet2.sh b/egs/swbd/s5c/local/nnet2/run_nnet2.sh index 0872560337b..e83c587a006 100755 --- a/egs/swbd/s5c/local/nnet2/run_nnet2.sh +++ b/egs/swbd/s5c/local/nnet2/run_nnet2.sh @@ -5,7 +5,7 @@ # units, on top of fMLLR features, on GPU. temp_dir= -dir=exp/nnet2_5 +dir=nnet2_5 has_fisher=true . ./cmd.sh @@ -18,10 +18,10 @@ parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll ( if [ ! -f exp/$dir/final.mdl ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d exp/$dir/egs/storage ]; then # spread the egs over various machines. utils/create_split_dir.pl \ - /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/exp/$dir/egs/storage exp/$dir/egs/storage fi steps/nnet2/train_pnorm_accel2.sh --parallel-opts "$parallel_opts" \ diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh new file mode 100755 index 00000000000..d46d5cc7238 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +train_stage=-10 +generate_alignments=true # false if doing ctc training +speed_perturb=true +speaker_perturb=true +lpc_order=100 +filter_nj=30 +spkf_per_spk=3 +perturb_suffix="" + +. ./path.sh +. 
./utils/parse_options.sh + +mkdir -p nnet3 +# perturbed data preparation +train_set=train_nodup + +if $speed_perturb; then + perturb_suffix="_sp" +fi + +if $speaker_perturb; then + perturb_suffix=$perturb_suffix"_fp" +fi + +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + echo "speed perturb the data" + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in train_nodup; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp_sp1 + utils/perturb_data_dir_speed.sh 0.95 data/${datadir} data/temp_sp2 + utils/perturb_data_dir_speed.sh 1.05 data/${datadir} data/temp_sp3 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp_sp4 + + utils/combine_data.sh data/${datadir}_temp_sp data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4 + utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp + rm -r data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4 + + if [ "$speaker_perturb" == "true" ]; then + echo "speaker perturbation of data" + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp_sp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_temp_sp data/temp_sp0 + utils/fix_data_dir.sh data/${datadir}_sp + + # compute filter correspond to different speed perturbed speaker. + spk_filters=spkfilters + mkdir -p $spk_filters + utils/split_data.sh data/${datadir}_sp $filter_nj + echo $filter_nj > data/${datadir}_sp/num_filter_jobs + + $decode_cmd JOB=1:$filter_nj data/${datadir}_sp/split$filter_nj/compute_filter.JOB.log \ + compute-filter --lpc-order=$lpc_order scp:data/${datadir}_sp/split$filter_nj/JOB/wav.scp \ + ark,scp:$spk_filters/spk_filter.JOB.ark,$spk_filters/spk_filter.JOB.scp || exit 1; + + # combine filters.scp files together + for n in $(seq $filter_nj); do + cat $spk_filters/spk_filter.$n.scp || exit 1; + done > data/${datadir}_sp/spk_filter.scp + echo "Finished generating filters per speakers." + + echo "Perturb data using speaker perturbation." + utils/perturb_data_signal_v2.sh $spkf_per_spk 'fp' data/${datadir}_sp data/${datadir}_temp_sp_fp + utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp_fp + fi + + echo "perturb_suffix=$perturb_suffix " + mfccdir=mfcc_perturbed + echo "Generating features using perturbed data" + steps/make_mfcc.sh --cmd "$decode_cmd" --nj 50 \ + data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_temp${perturb_suffix} + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}${perturb_suffix} data/${datadir}_temp${perturb_suffix} data/temp0 + utils/fix_data_dir.sh data/${datadir}${perturb_suffix} + rm -r data/temp0 data/${datadir}_temp${perturb_suffix} + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/train_nodup${perturb_suffix} data/lang_nosp exp/tri4 exp/tri4_ali_nodup${perturb_suffix} || exit 1 + fi +fi + +train_set=train_nodup${perturb_suffix} +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + # the 100k_nodup directory is copied seperately, as + # we want to use exp/tri2_ali_100k_nodup for lda_mllt training + # the main train directory might be speed_perturbed + for dataset in $train_set train_100k_nodup; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/${dataset}_hires + cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + if false; then #300 + for dataset in eval2000 train_dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + fi #300 + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi +if false; then #400 +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri2_ali_100k_nodup exp/nnet3/tri3b +fi + +if [ $stage -le 6 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). 
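# The inline Python above applies a random gain in [1/8, 2] to every wav.scp entry before
# the hires MFCCs are dumped; as the original comment notes, this helps because CMVN is not
# applied to these features. A roughly equivalent awk sketch (assuming the wav.scp entries
# are already piped commands ending in '|', as they are for Switchboard) would be:
awk 'BEGIN{srand()} {printf("%s sox --vol %.3f -t wav - -t wav - |\n", $0, 0.125 + rand()*1.875)}' \
  data/${train_set}_hires/wav.scp | sort -k1,1 -u > data/${train_set}_hires/wav.scp_scaled
mv data/${train_set}_hires/wav.scp_scaled data/${train_set}_hires/wav.scp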
+ steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi +fi #400 + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; + + for data_set in eval2000 train_dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; + done +fi + +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_lstm.sh b/egs/swbd/s5c/local/nnet3/run_lstm.sh index e53f3387fd4..11fc851cb71 100755 --- a/egs/swbd/s5c/local/nnet3/run_lstm.sh +++ b/egs/swbd/s5c/local/nnet3/run_lstm.sh @@ -20,16 +20,17 @@ has_fisher=true affix= speed_perturb=true common_egs_dir= +reporting_email= # LSTM options splice_indexes="-2,-1,0,1,2 0 0" lstm_delay=" -1 -2 -3 " label_delay=5 num_lstm_layers=3 -cell_dim=1280 +cell_dim=1024 hidden_dim=1024 -recurrent_projection_dim=384 -non_recurrent_projection_dim=384 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 chunk_width=20 chunk_left_context=40 chunk_right_context=0 @@ -55,7 +56,7 @@ frames_per_chunk= echo "$0 $@" # Print the command line for logging -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -81,40 +82,62 @@ ali_dir=exp/tri4_ali_nodup$suffix local/nnet3/run_ivector_common.sh --stage $stage \ --speed-perturb $speed_perturb || exit 1; -if [ $stage -le 9 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - steps/nnet3/lstm/train.sh --stage $train_stage \ - --label-delay $label_delay \ - --lstm-delay "$lstm_delay" \ - --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ - --num-chunk-per-minibatch $num_chunk_per_minibatch \ - --samples-per-iter $samples_per_iter \ - --splice-indexes "$splice_indexes" \ - --feat-type raw \ - --online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --cmvn-opts "--norm-means=false --norm-vars=false" \ - --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ - --momentum $momentum \ - --cmd "$decode_cmd" \ +if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + config_extra_opts=() + [ ! 
-z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --ali-dir $ali_dir \ --num-lstm-layers $num_lstm_layers \ + --splice-indexes "$splice_indexes " \ --cell-dim $cell_dim \ --hidden-dim $hidden_dim \ --recurrent-projection-dim $recurrent_projection_dim \ --non-recurrent-projection-dim $non_recurrent_projection_dim \ - --chunk-width $chunk_width \ - --chunk-left-context $chunk_left_context \ - --chunk-right-context $chunk_right_context \ - --egs-dir "$common_egs_dir" \ - --remove-egs $remove_egs \ - data/${train_set}_hires data/lang $ali_dir $dir || exit 1; + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + fi -graph_dir=exp/tri4/graph_sw1_tg if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=$samples_per_iter \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \ + --trainer.optimization.momentum=$momentum \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=100 \ + --use-gpu=true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +graph_dir=exp/tri4/graph_sw1_tg +if [ $stage -le 11 ]; then if [ -z $extra_left_context ]; then extra_left_context=$chunk_left_context fi @@ -129,7 +152,7 @@ if [ $stage -le 10 ]; then num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` steps/nnet3/lstm/decode.sh --nj 250 --cmd "$decode_cmd" \ --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ + --extra-right-context $extra_right_context \ --frames-per-chunk "$frames_per_chunk" \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg || exit 1; diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn.sh b/egs/swbd/s5c/local/nnet3/run_tdnn.sh index 448b5bd174c..5254bc31857 100755 --- a/egs/swbd/s5c/local/nnet3/run_tdnn.sh +++ b/egs/swbd/s5c/local/nnet3/run_tdnn.sh @@ -11,10 +11,13 @@ # --num-threads 16 and --minibatch-size 128. stage=0 +affix= train_stage=-10 has_fisher=true speed_perturb=true - +common_egs_dir= +reporting_email= +remove_egs=true . cmd.sh . 
./path.sh @@ -41,26 +44,52 @@ ali_dir=exp/tri4_ali_nodup$suffix local/nnet3/run_ivector_common.sh --stage $stage \ --speed-perturb $speed_perturb || exit 1; + if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --ali-dir $ali_dir \ + --relu-dim 1024 \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ + --use-presoftmax-prior-scale true \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 10 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/train_tdnn.sh --stage $train_stage \ - --num-epochs 2 --num-jobs-initial 3 --num-jobs-final 16 \ - --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ - --feat-type raw \ - --online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --cmvn-opts "--norm-means=false --norm-vars=false" \ - --initial-effective-lrate 0.0017 --final-effective-lrate 0.00017 \ - --cmd "$decode_cmd" \ - --relu-dim 1024 \ - data/${train_set}_hires data/lang $ali_dir $dir || exit 1; + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + fi graph_dir=exp/tri4/graph_sw1_tg -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then for decode_set in train_dev eval2000; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh index 3bce900aecf..26d77d10f15 100755 --- a/egs/swbd/s5c/local/score_sclite.sh +++ b/egs/swbd/s5c/local/score_sclite.sh @@ -50,7 +50,11 @@ if $reverse; then reorder_opt="--reorder=false" fi -if [ -f $dir/../frame_subsampling_factor ]; then + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then factor=$(cat $dir/../frame_subsampling_factor) || exit 1 frame_shift_opt="--frame-shift=0.0$factor" echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" diff --git a/egs/swbd/s5c/local/swbd1_data_download.sh b/egs/swbd/s5c/local/swbd1_data_download.sh index 00ec97c5028..d8f076b5141 100755 --- a/egs/swbd/s5c/local/swbd1_data_download.sh +++ b/egs/swbd/s5c/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). 
-## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,24 +23,19 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( cd $dir; if [ ! -d swb_ms98_transcriptions ]; then echo " *** Downloading trascriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz tar -xf switchboard_word_alignments.tar.gz fi diff --git a/egs/swbd/s5c/local/swbd1_data_prep.sh b/egs/swbd/s5c/local/swbd1_data_prep.sh index 57fb0ff56c8..9621e7fc06e 100755 --- a/egs/swbd/s5c/local/swbd1_data_prep.sh +++ b/egs/swbd/s5c/local/swbd1_data_prep.sh @@ -21,7 +21,7 @@ #check existing directories if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" exit 1; fi @@ -41,23 +41,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; diff --git a/egs/tedlium/s5/RESULTS b/egs/tedlium/s5/RESULTS index 9c494712aa8..0c209bddf7e 100644 --- a/egs/tedlium/s5/RESULTS +++ b/egs/tedlium/s5/RESULTS @@ -7,6 +7,27 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; d for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp exit 0 + +#---------------------------------Current results (after fixing the problem)--------------------------------- +# There was a problem with the language model preparation where the scripts expected to represent OOV words while +# the language model used to represent them. See `git log tedlium-unk-fix` for details. +# Fixing this causes a small decrease in WER. 
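# A quick way to check which OOV token an ARPA LM actually contains (the LM path below is
# only an example; point it at wherever the Cantab LM was unpacked):
zcat db/cantab-TEDLIUM/cantab-TEDLIUM.lm3.gz | grep -o -m1 -w -e '<unk>' -e '<UNK>'
# whichever token this prints is the one the lexicon and the prepare_lang.sh call must agree on.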
+ +# GMMs +# DEV SPEAKERS: +%WER 31.0 | 507 17792 | 73.5 20.2 6.3 4.5 31.0 97.2 | -0.032 | exp/tri1/decode_nosp_dev/score_11_0.0/ctm.filt.filt.sys +%WER 26.4 | 507 17792 | 77.8 16.7 5.5 4.2 26.4 95.5 | -0.066 | exp/tri2/decode_nosp_dev/score_13_0.0/ctm.filt.filt.sys +%WER 26.1 | 507 17792 | 77.2 16.3 6.5 3.4 26.1 95.5 | -0.106 | exp/tri2/decode_dev/score_14_1.0/ctm.filt.filt.sys +%WER 22.0 | 507 17792 | 81.6 13.2 5.2 3.6 22.0 93.9 | -0.189 | exp/tri3/decode_dev/score_13_1.0/ctm.filt.filt.sys + +# TEST SPEAKERS: +%WER 30.9 | 1155 27512 | 72.1 21.0 6.9 3.0 30.9 94.5 | 0.035 | exp/tri1/decode_nosp_test/score_12_0.5/ctm.filt.filt.sys +%WER 25.5 | 1155 27512 | 78.0 17.4 4.6 3.6 25.5 92.8 | -0.034 | exp/tri2/decode_nosp_test/score_12_0.0/ctm.filt.filt.sys +%WER 24.9 | 1155 27512 | 78.3 16.7 5.0 3.2 24.9 93.0 | -0.020 | exp/tri2/decode_test/score_14_0.5/ctm.filt.filt.sys +%WER 20.3 | 1155 27512 | 82.7 13.4 3.9 3.0 20.3 90.0 | -0.063 | exp/tri3/decode_test/score_14_0.5/ctm.filt.filt.sys + +#---------------------------------(Pre- fix for Cantab LM) Provided for reference---------------------------------- + # Results from Nikolay, using kaldi scoring: # %WER 35.17 [ 9677 / 27512, 1267 ins, 1681 del, 6729 sub ] exp/tri1/decode/wer_13 # %WER 30.03 [ 8262 / 27512, 1255 ins, 1367 del, 5640 sub ] exp/tri2/decode/wer_15 diff --git a/egs/tedlium/s5/cmd.sh b/egs/tedlium/s5/cmd.sh index bed97d34020..ba7f120e599 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -19,7 +19,7 @@ host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/tedlium/s5/local/prepare_dict.sh b/egs/tedlium/s5/local/prepare_dict.sh index a3207de050a..fcb03ea7aef 100755 --- a/egs/tedlium/s5/local/prepare_dict.sh +++ b/egs/tedlium/s5/local/prepare_dict.sh @@ -1,7 +1,8 @@ #!/bin/bash # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Daniel Galvez # Apache 2.0 # @@ -13,10 +14,11 @@ srcdict=db/cantab-TEDLIUM/cantab-TEDLIUM.dct [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 # Join dicts and fix some troubles -cat $srcdict | grep -v "" | grep -v "" | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt +cat $srcdict | grep -v -w "" | grep -v -w "" | grep -v -w "" | \ + LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ - grep -v SIL | sort > $dir/nonsilence_phones.txt + grep -v SIL | sort > $dir/nonsilence_phones.txt ( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $dir/silence_phones.txt @@ -27,9 +29,11 @@ echo SIL > $dir/optional_silence.txt echo -n >$dir/extra_questions.txt # Add to the lexicon the silences, noises etc. +# Typically, you would use " NSN" here, but the Cantab Research language models +# use instead of to represent out of vocabulary words. 
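# For illustration, the -w added to the greps above makes the filtering word-bounded, so a
# special token is dropped without removing real entries that merely contain it as a
# substring; on a toy lexicon (hypothetical entries):
printf '%s\n' 'unk SPN' 'unknown AH N N OW N' | grep -v -w unk
# keeps only the 'unknown' pronunciation.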
(echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' - echo ' NSN' ) | \ + echo ' NSN' ) | \ cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt # Check that the dict dir is okay! diff --git a/egs/tedlium/s5/run.sh b/egs/tedlium/s5/run.sh index 7a36e49e8e0..e1dbf7b80e0 100755 --- a/egs/tedlium/s5/run.sh +++ b/egs/tedlium/s5/run.sh @@ -9,7 +9,7 @@ # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, # which allow free non-commercial use, while only a citation is required. # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) # Apache 2.0 # @@ -28,17 +28,18 @@ stage=0 # Data preparation if [ $stage -le 0 ]; then local/download_data.sh || exit 1 - + local/prepare_data.sh || exit 1 local/prepare_dict.sh || exit 1 utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_nosp data/lang_nosp || exit 1 + "" data/local/lang_nosp data/lang_nosp || exit 1 local/prepare_lm.sh || exit 1 fi + # Feature extraction feat_dir=$pwd/data/mfcc_features if [ $stage -le 1 ]; then @@ -100,7 +101,7 @@ if [ $stage -le 5 ]; then data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ exp/tri2/sil_counts_nowb.txt \ exp/tri2/pron_bigram_counts_nowb.txt data/local/dict - + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang cp -rT data/lang data/lang_test cp -rT data/lang data/lang_rescore @@ -134,6 +135,8 @@ if [ $stage -le 6 ]; then exp/tri3/graph data/test exp/tri3/decode_test || exit 1 fi +# steps/cleanup/debug_lexicon.sh --nj 100 --alidir exp/tri3 --cmd "$train_cmd" data/train data/lang exp/tri3 data/local/dict/lexicon.txt exp/tri3_debug_lexicon & + if [ $stage -le 7 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri3 exp/tri3_ali || exit 1 diff --git a/egs/thchs30/README.txt b/egs/thchs30/README.txt new file mode 100644 index 00000000000..acbdea4a263 --- /dev/null +++ b/egs/thchs30/README.txt @@ -0,0 +1,10 @@ +THCHS30 is an open Chinese speech database published by Center for Speech and Language Technology (CSLT) at Tsinghua University. + +The origional recording was conducted in 2002 by Dong Wang, supervised by Prof. Xiaoyan Zhu, at the Key State Lab of Intelligence and System, Department of Computer Science, Tsinghua Universeity, and the original name was 'TCMSD', standing for 'Tsinghua Continuous Mandarin Speech Database'. The publication after 13 years has been initiated by Dr. Dong Wang and was supported by Prof. Xiaoyan Zhu. We hope to provide a toy database for new researchers in the field of speech recognition. Therefore, the database is totally free to academic users. 
+ +The database can be downloaded from openslr: +http://www.openslr.org/18/ + +or from the CSLT server: +http://data.cslt.org/thchs30/README.html + diff --git a/egs/thchs30/s5/RESULTS b/egs/thchs30/s5/RESULTS new file mode 100644 index 00000000000..70718ea4c2a --- /dev/null +++ b/egs/thchs30/s5/RESULTS @@ -0,0 +1,61 @@ +#!/bin/bash +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_phone* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#phone task +%WER 31.49 [ 113986 / 362027, 20820 ins, 22043 del, 71123 sub ] exp/mono/decode_test_phone/wer_5 +%WER 20.56 [ 74445 / 362027, 15452 ins, 12457 del, 46536 sub ] exp/tri1/decode_test_phone/wer_5 +%WER 17.32 [ 62689 / 362027, 11937 ins, 11260 del, 39492 sub ] exp/tri2b/decode_test_phone/wer_6 +%WER 18.06 [ 65368 / 362027, 10426 ins, 13780 del, 41162 sub ] exp/tri3b/decode_test_phone/wer_5 +%WER 18.50 [ 66984 / 362027, 13117 ins, 11917 del, 41950 sub ] exp/tri3b/decode_test_phone.si/wer_5 +%WER 16.17 [ 58544 / 362027, 9628 ins, 11746 del, 37170 sub ] exp/tri4b/decode_test_phone/wer_6 +%WER 16.59 [ 60060 / 362027, 11440 ins, 10477 del, 38143 sub ] exp/tri4b/decode_test_phone.si/wer_6 +%WER 10.27 [ 37173 / 362027, 8675 ins, 6483 del, 22015 sub ] exp/tri4b_dnn/decode_test_phone/wer_4 +%WER 10.11 [ 36591 / 362027, 8702 ins, 6255 del, 21634 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it1/wer_4 +%WER 10.03 [ 36321 / 362027, 7490 ins, 6731 del, 22100 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it2/wer_5 +%WER 10.01 [ 36249 / 362027, 7507 ins, 6677 del, 22065 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it3/wer_5 + +exit 0 + +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_word* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#word task +%WER 51.04 [ 41414 / 81139, 474 ins, 2404 del, 38536 sub ] exp/mono/decode_test_word/wer_9 +%WER 36.38 [ 29522 / 81139, 516 ins, 1096 del, 27910 sub ] exp/tri1/decode_test_word/wer_10 +%WER 32.51 [ 26379 / 81139, 469 ins, 940 del, 24970 sub ] exp/tri2b/decode_test_word/wer_9 +%WER 31.65 [ 25684 / 81139, 340 ins, 1085 del, 24259 sub ] exp/tri3b/decode_test_word/wer_9 +%WER 34.07 [ 27643 / 81139, 443 ins, 1100 del, 26100 sub ] exp/tri3b/decode_test_word.si/wer_10 +%WER 29.64 [ 24052 / 81139, 341 ins, 929 del, 22782 sub ] exp/tri4b/decode_test_word/wer_11 +%WER 31.71 [ 25732 / 81139, 472 ins, 902 del, 24358 sub ] exp/tri4b/decode_test_word.si/wer_10 +%WER 23.57 [ 19123 / 81139, 419 ins, 585 del, 18119 sub ] exp/tri4b_dnn/decode_test_word/wer_7 +%WER 23.40 [ 18984 / 81139, 397 ins, 567 del, 18020 sub ] exp/tri4b_dnn_mpe/decode_test_word_it1/wer_7 +%WER 23.27 [ 18884 / 81139, 396 ins, 553 del, 17935 sub ] exp/tri4b_dnn_mpe/decode_test_word_it2/wer_7 +%WER 23.18 [ 18804 / 81139, 368 ins, 618 del, 17818 sub ] exp/tri4b_dnn_mpe/decode_test_word_it3/wer_8 + +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_phone_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#phone task +%WER 84.01 [ 304141 / 362027, 717 ins, 275948 del, 27476 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/white/wer_4 +%WER 14.11 [ 51074 / 362027, 10941 ins, 8175 del, 31958 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/car/wer_5 +%WER 71.63 [ 259329 / 362027, 6164 ins, 217508 del, 35657 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/cafe/wer_4 +%WER 40.04 [ 144946 / 
362027, 17764 ins, 35162 del, 92020 sub ] exp/tri4b_dnn_dae/decode_phone_0db/white/wer_6 +%WER 11.81 [ 42773 / 362027, 9598 ins, 7552 del, 25623 sub ] exp/tri4b_dnn_dae/decode_phone_0db/car/wer_5 +%WER 32.39 [ 117256 / 362027, 17793 ins, 27750 del, 71713 sub ] exp/tri4b_dnn_dae/decode_phone_0db/cafe/wer_6 +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_word_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#word task +%WER 98.56 [ 79973 / 81139, 15 ins, 64293 del, 15665 sub ] exp/tri4b_dnn_mpe/decode_word_0db/white/wer_4 +%WER 28.10 [ 22799 / 81139, 553 ins, 661 del, 21585 sub ] exp/tri4b_dnn_mpe/decode_word_0db/car/wer_8 +%WER 85.58 [ 69438 / 81139, 321 ins, 49066 del, 20051 sub ] exp/tri4b_dnn_mpe/decode_word_0db/cafe/wer_8 +%WER 65.23 [ 52923 / 81139, 827 ins, 4198 del, 47898 sub ] exp/tri4b_dnn_dae/decode_word_0db/white/wer_13 +%WER 25.12 [ 20379 / 81139, 444 ins, 676 del, 19259 sub ] exp/tri4b_dnn_dae/decode_word_0db/car/wer_9 +%WER 53.38 [ 43308 / 81139, 907 ins, 4164 del, 38237 sub ] exp/tri4b_dnn_dae/decode_word_0db/cafe/wer_12 + +exit 0 diff --git a/egs/thchs30/s5/cmd.sh b/egs/thchs30/s5/cmd.sh new file mode 100644 index 00000000000..1d8e768790f --- /dev/null +++ b/egs/thchs30/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/thchs30/s5/conf/decode_dnn.config b/egs/thchs30/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/thchs30/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/thchs30/s5/conf/fbank.conf b/egs/thchs30/s5/conf/fbank.conf new file mode 100644 index 00000000000..8e6e36c69cf --- /dev/null +++ b/egs/thchs30/s5/conf/fbank.conf @@ -0,0 +1,3 @@ +# No non-default options for now. +#--sample-frequency=8000 +--num-mel-bins=40 diff --git a/egs/thchs30/s5/conf/mfcc.conf b/egs/thchs30/s5/conf/mfcc.conf new file mode 100644 index 00000000000..47d6c48bfe5 --- /dev/null +++ b/egs/thchs30/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +#--sample-frequency=8000 diff --git a/egs/thchs30/s5/local/dae/add-noise-mod.py b/egs/thchs30/s5/local/dae/add-noise-mod.py new file mode 100755 index 00000000000..33e8a297aef --- /dev/null +++ b/egs/thchs30/s5/local/dae/add-noise-mod.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# Copyright 2016 Tsinghua University (Author: Chao Liu, Dong Wang). Apache 2.0. 
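# Example standalone invocation, mirroring the flags run_dae.sh passes below (the split
# scp name is just one of the $nj pieces produced by utils/split_scp.pl):
#   local/dae/add-noise-mod.py --noise-level 0 --sigma0 10 --seed 32 --verbose 0 \
#     --noise-prior "0.0,10.0,10.0,10.0" --noise-src data/dae/noise/noise.scp \
#     --wav-src exp/dae/gendata/train_split_1.scp --wavdir wav/dae/train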
+ + +from __future__ import print_function +import optparse +import random +import bisect +import re +import logging +import wave +import math +import struct +import sys +import os + +try: + import pyximport; pyximport.install() + from thchs30_util import * +except: + print("Cython possibly not installed, using standard python code. The process might be slow", file=sys.stderr) + + def energy(mat): + return float(sum([x * x for x in mat])) / len(mat) + + def mix(mat, noise, pos, scale): + ret = [] + l = len(noise) + for i in xrange(len(mat)): + x = mat[i] + d = int(x + scale * noise[pos]) + #if d > 32767 or d < -32768: + # logging.debug('overflow occurred!') + d = max(min(d, 32767), -32768) + ret.append(d) + pos += 1 + if pos == l: + pos = 0 + return (pos, ret) + + +def dirichlet(params): + samples = [random.gammavariate(x, 1) if x > 0 else 0. for x in params] + samples = [x / sum(samples) for x in samples] + for x in xrange(1, len(samples)): + samples[x] += samples[x - 1] + return bisect.bisect_left(samples, random.random()) + +def wave_mat(wav_filename): + f = wave.open(wav_filename, 'r') + n = f.getnframes() + ret = f.readframes(n) + f.close() + return list(struct.unpack('%dh' % n, ret)) + +def num_samples(mat): + return len(mat) + +def scp(scp_filename): + with open(scp_filename) as f: + for l in f: + yield tuple(l.strip().split()) + +def wave_header(sample_array, sample_rate): + byte_count = (len(sample_array)) * 2 # short + # write the header + hdr = struct.pack(' len(n): + noise_energies[type] = energy(n[p::]+n[0:len(n)-p:]) + else: + noise_energies[type] = energy(n[p:p+len(mat):]) + scale = math.sqrt(noise / noise_energies[type]) + logging.debug('noise scale: %f', scale) + pos, result = mix(mat, n, p, scale) + noises[type] = (pos, n) + if args.wavdir != 'NULL': + output_wave_file(args.wavdir, tag, result) + else: + output(tag, result) + +if __name__ == '__main__': + main() + + + diff --git a/egs/thchs30/s5/local/dae/run_dae.sh b/egs/thchs30/s5/local/dae/run_dae.sh new file mode 100755 index 00000000000..f6a6db3a01a --- /dev/null +++ b/egs/thchs30/s5/local/dae/run_dae.sh @@ -0,0 +1,149 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#Conducts experiments of dae-based denoisng + +stage=0 +nj=8 + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) +. utils/parse_options.sh || exit 1; + +thchs=$1 + +#generate noisy data. We focuse on the 0db condition. +#For training set, generate noisy data with SNR mean=0, variance=10, with three noise types mixed together. +#For dev, generate noisy data with SNR mean=0, variance=0, with three niose types mixed together +#For test, use the standard test data which were generated by SNR mean=0, variance=0. + +if [ $stage = 0 ]; then + #generat noise.scp + mkdir -p data/dae/noise && \ + awk '{print $1 " '$thchs'/resource/noise/"$2}' $thchs/resource/noise/noise.scp > data/dae/noise/noise.scp || exit 1 + + echo "DAE: generate training data..." + noise_scp=data/dae/noise/noise.scp + noise_prior="0.0,10.0,10.0,10.0" #define noise type to sample. 
[S_clean, S_white, S_car, S_cafe] + noise_level=0 #0db condition + sigma0=10 #some random in SNR + seed=32 + verbose=0 + wavdir=wav/dae/train + rm -rf data/dae/train && mkdir -p data/dae/train || exit 1 + cp data/fbank/train/{spk2utt,utt2spk,text} data/dae/train || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/train/wav.scp > data/dae/train/wav.scp || exit 1 + + mkdir -p exp/dae/gendata + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/train_split_${n}.scp" + done + utils/split_scp.pl data/fbank/train/wav.scp $split_scps || exit 1 + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_train.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/train_split_JOB.scp --wavdir $wavdir \ + || exit 1 + + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/train exp/dae/gendata fbank/dae/train || exit 1 + steps/compute_cmvn_stats.sh data/dae/train exp/dae/cmvn \ + fbank/dae/train || exit 1 + + #genreate dev data. Just the 0db condition is produced. Multiple noise types mixed together. + echo "DAE: generating dev data..." + wavdir=wav/dae/dev/0db + sigma0=0 #no random in SNR + rm -rf data/dae/dev/0db && mkdir -p data/dae/dev/0db && \ + cp -L data/fbank/dev/{spk2utt,utt2spk,text} data/dae/dev/0db || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/dev/wav.scp > data/dae/dev/0db/wav.scp || exit 1 + + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/dev_split_${n}.scp" + done + utils/split_scp.pl data/fbank/dev/wav.scp $split_scps || exit 1 + + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_dev.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/dev_split_JOB.scp --wavdir $wavdir \ + || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/dev/0db exp/dae/gendata fbank/dae/dev/0db || exit 1 + steps/compute_cmvn_stats.sh data/dae/dev/0db exp/dae/cmvn \ + fbank/dae/dev/0db || exit 1 + + #generate test data. Assume it has been downloaded in $thchs/test-noise + echo "DAE: generating test data..." 
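# The $train_cmd JOB=1:$nj lines above fan one add-noise process out per split scp; the
# same substitution can be tried in isolation (the log path here is just an example):
utils/run.pl JOB=1:4 exp/dae/gendata/demo.JOB.log echo "this is job JOB"
# run.pl expands JOB in both the log name and the command, giving demo.1.log ... demo.4.log.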
+ #generate fbank + for x in car white cafe; do + echo "producing fbanks for $x" + mkdir -p data/dae/test/0db/$x && \ + cp -L data/fbank/test/{spk2utt,utt2spk,text} data/dae/test/0db/$x && \ + awk '{print $1 " '$thchs'/test-noise/0db/'$x'/"$1".wav"}' data/fbank/test/wav.scp > data/dae/test/0db/$x/wav.scp || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/test/0db/$x exp/dae/gendata fbank/dae/test/0db/$x || exit 1 + echo "generating cmvn for test data $x" + steps/compute_cmvn_stats.sh data/dae/test/0db/$x exp/dae/cmvn \ + fbank/dae/test/0db/$x || exit 1 + cp -R data/dae/test/0db/$x data/dae/test/0db/${x}_phone && cp data/test/phone.txt data/dae/test/0db/${x}_phone/text || exit 1 + done +fi + +#DAE training +if [ $stage -le 1 ]; then + #train dnn dae using data with mixed noise + #produce merged feats.scp as --labels for both training and cv + dir=exp/tri4b_dnn_dae && mkdir -p exp/tri4b_dnn_dae || exit 1 + cat data/fbank/train/feats.scp data/fbank/dev/feats.scp | sort -u > $dir/tgt_feats.scp + cat data/fbank/train/cmvn.scp data/fbank/dev/cmvn.scp | sort -u > $dir/tgt_cmvn.scp + + num_fea=$(feat-to-dim scp:$dir/tgt_feats.scp -) + echo "num_fea = $num_fea" + + $cuda_cmd exp/tri4b_dnn_dae/log/train_nnet.log \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1200 \ + --cmvn-opts "--norm-vars=false" --splice 10 \ + --learn-rate 0.0001 \ + --train_tool_opts "--objective-function=mse" \ + --copy_feats false \ + --labels "ark:copy-feats scp:$dir/tgt_feats.scp ark:- | apply-cmvn --norm-vars=false scp:$dir/tgt_cmvn.scp ark:- ark:- | feat-to-post ark:- ark:-|" \ + --num-tgt $num_fea \ + --proto-opts '--no-softmax ' \ + data/dae/train data/dae/dev/0db data/lang \ + data/fbank/train data/fbank/dev \ + exp/tri4b_dnn_dae || exit 1; + nnet-concat exp/tri4b_dnn_dae/final.feature_transform exp/tri4b_dnn_dae/final.nnet \ + exp/tri4b_dnn_mpe/final.feature_transform exp/tri4b_dnn_dae/dae.nnet || exit 1 + +fi + +#decoding +if [ $stage -le 2 ]; then + for x in car white cafe; do + ( + #decode word + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_mpe/decode_word_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_dae/decode_word_0db/$x || exit 1; + + #decode phone + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_mpe/decode_phone_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_dae/decode_phone_0db/$x || exit 1; + ) & + done +fi + diff --git a/egs/thchs30/s5/local/dae/thchs30_util.pyx b/egs/thchs30/s5/local/dae/thchs30_util.pyx new file mode 100755 index 00000000000..281ff166032 --- /dev/null +++ b/egs/thchs30/s5/local/dae/thchs30_util.pyx @@ -0,0 +1,27 @@ +# Copyright 2016 Tsinghua University (Author: Chao Liu). Apache 2.0. 
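# The --labels option in run_dae.sh above feeds clean fbank features, converted by
# feat-to-post, to the network as regression targets (hence --objective-function=mse).
# The target stream can be inspected on its own once stage 1 has written tgt_feats.scp;
# a sketch, run from egs/thchs30/s5 (text form, first utterance only):
copy-feats scp:exp/tri4b_dnn_dae/tgt_feats.scp ark:- | \
  apply-cmvn --norm-vars=false scp:exp/tri4b_dnn_dae/tgt_cmvn.scp ark:- ark:- | \
  feat-to-post ark:- ark,t:- | head -n 1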
+ +def energy(list mat): + cdef float e + cdef int i, j, l + l = len(mat) + for i in range(l): + j = mat[i] + e += j * j + e /= l + return e + +def mix(list mat, list noise, int pos, double scale): + cdef len_noise, len_mat, i, x, y + ret = [] + len_noise = len(noise) + len_mat = len(mat) + for i in range(len_mat): + x = mat[i] + y = int(x + scale * noise[pos]) + if y > 32767: + y = 32767 + elif y < -32768: + y = -32768 + ret.append(y) + pos = (pos + 1) % len_noise + return pos, ret diff --git a/egs/thchs30/s5/local/download_and_untar.sh b/egs/thchs30/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..655e674dc9b --- /dev/null +++ b/egs/thchs30/s5/local/download_and_untar.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# Copyright 2016 Tsinghua University (author: Dong Wang) +# Apache 2.0 + +# Adapted from librispeech recipe local/download_and_untar.sh + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_thchs30, test-noise, resource" +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_thchs30 test-noise resource" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + + +sizes="6453425169 1971460210 24813708" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + pwd + echo " wget --no-check-certificate $full_url" + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi diff --git a/egs/thchs30/s5/local/nnet/run_dnn.sh b/egs/thchs30/s5/local/nnet/run_dnn.sh new file mode 100755 index 00000000000..d40f48e3609 --- /dev/null +++ b/egs/thchs30/s5/local/nnet/run_dnn.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#run from ../.. +#DNN training, both xent and MPE + + +. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +stage=0 +nj=8 + +. utils/parse_options.sh || exit 1; + +gmmdir=$1 +alidir=$2 +alidir_cv=$3 + +#generate fbanks +if [ $stage -le 0 ]; then + echo "DNN training: stage 0: feature generation" + rm -rf data/fbank && mkdir -p data/fbank && cp -R data/{train,dev,test,test_phone} data/fbank || exit 1; + for x in train dev test; do + echo "producing fbank for $x" + #fbank generation + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" data/fbank/$x exp/make_fbank/$x fbank/$x || exit 1 + #ompute cmvn + steps/compute_cmvn_stats.sh data/fbank/$x exp/fbank_cmvn/$x fbank/$x || exit 1 + done + + echo "producing test_fbank_phone" + cp data/fbank/test/feats.scp data/fbank/test_phone && cp data/fbank/test/cmvn.scp data/fbank/test_phone || exit 1; + +fi + + +#xEnt training +if [ $stage -le 1 ]; then + outdir=exp/tri4b_dnn + #NN training + (tail --pid=$$ -F $outdir/log/train_nnet.log 2>/dev/null)& # forward log + $cuda_cmd $outdir/log/train_nnet.log \ + steps/nnet/train.sh --copy_feats false --cmvn-opts "--norm-means=true --norm-vars=false" --hid-layers 4 --hid-dim 1024 \ + --learn-rate 0.008 data/fbank/train data/fbank/dev data/lang $alidir $alidir_cv $outdir || exit 1; + #Decode (reuse HCLG graph in gmmdir) + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone || exit 1; + )& + +fi + +#MPE training + +srcdir=exp/tri4b_dnn +acwt=0.1 + +if [ $stage -le 2 ]; then + # generate lattices and alignments + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + data/fbank/train data/lang $srcdir ${srcdir}_ali || exit 1; + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + data/fbank/train data/lang $srcdir ${srcdir}_denlats || exit 1; +fi + +if [ $stage -le 3 ]; then + outdir=exp/tri4b_dnn_mpe + #Re-train the DNN by 3 iteration of MPE + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 3 --acwt $acwt --do-smbr false \ + data/fbank/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $outdir || exit 1 + #Decode (reuse HCLG graph) + for ITER in 3 2 1; do + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word_it${ITER} || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone_it${ITER} || exit 1; + )& + done +fi + diff --git a/egs/thchs30/s5/local/score.sh b/egs/thchs30/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/thchs30/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/thchs30/s5/local/thchs-30_data_prep.sh b/egs/thchs30/s5/local/thchs-30_data_prep.sh new file mode 100755 index 00000000000..7a85274ce83 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_data_prep.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). 
Apache 2.0. + +#This script pepares the data directory for thchs30 recipe. +#It reads the corpus and get wav.scp and transcriptions. + +dir=$1 +corpus_dir=$2 + + +cd $dir + +echo "creating data/{train,dev,test}" +mkdir -p data/{train,dev,test} + +#create wav.scp, utt2spk.scp, spk2utt.scp, text +( +for x in train dev test; do + echo "cleaning data/$x" + cd $dir/data/$x + rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text + echo "preparing scps and text in data/$x" + for nn in `find $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do + echo $nn $corpus_dir/$x/$nn.wav >> wav.scp + echo $nn $nn >> utt2spk + echo $nn $nn >> spk2utt + echo $nn `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt + echo $nn `sed -n 3p $corpus_dir/data/$nn.wav.trn` >> phone.txt + done + cp word.txt text +done +) || exit 1 + +echo "creating test_phone for phone decoding" +( + rm -rf data/test_phone && cp -R data/test data/test_phone || exit 1 + cd data/test_phone && rm text && cp phone.txt text || exit 1 +) + diff --git a/egs/thchs30/s5/local/thchs-30_decode.sh b/egs/thchs30/s5/local/thchs-30_decode.sh new file mode 100755 index 00000000000..f9661f61f21 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_decode.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#decoding wrapper for thchs30 recipe +#run from ../ + +nj=8 +mono=false + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +. utils/parse_options.sh || exit 1; +decoder=$1 +srcdir=$2 +datadir=$3 + + +if [ $mono = true ];then + echo "using monophone to generate graph" + opt="--mono" +fi + +#decode word +utils/mkgraph.sh $opt data/graph/lang $srcdir $srcdir/graph_word || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_word $datadir/test $srcdir/decode_test_word || exit 1 + +#decode phone +utils/mkgraph.sh $opt data/graph_phone/lang $srcdir $srcdir/graph_phone || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_phone $datadir/test_phone $srcdir/decode_test_phone || exit 1 + + diff --git a/egs/thchs30/s5/local/wer_output_filter b/egs/thchs30/s5/local/wer_output_filter new file mode 100755 index 00000000000..1ccb651a258 --- /dev/null +++ b/egs/thchs30/s5/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python +#Copyright 2016 Tsinghua University (Author: Dong Wang). Apache 2.0. + +#This script accepts a Chinese stream and inserts blanks between Chinese characters +#Used to prepare character-based transcriptions and compute CER. + +from __future__ import print_function +import sys + +for l in sys.stdin: + l=l.strip() + ll=l.split() + lk=ll[0] + for v in ll[1:]: + v = v.decode('utf-8') + for i in v: + lk= lk + ' ' + i + + print (lk.encode('utf-8')) diff --git a/egs/thchs30/s5/path.sh b/egs/thchs30/s5/path.sh new file mode 100755 index 00000000000..bc199673fc5 --- /dev/null +++ b/egs/thchs30/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh + +export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$PWD:$PATH + +export LC_ALL=C + diff --git a/egs/thchs30/s5/run.sh b/egs/thchs30/s5/run.sh new file mode 100755 index 00000000000..24645f59e83 --- /dev/null +++ b/egs/thchs30/s5/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh + +H=`pwd` #exp home +n=8 #parallel jobs + +#corpus and trans directory +thchs=/nfs/public/materials/data/thchs30-openslr + +#you can obtain the database by uncommenting the following lines +#[ -d $thchs ] || mkdir -p $thchs || exit 1 +#echo "downloading THCHS30 at $thchs ..." +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 data_thchs30 || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 resource || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 test-noise || exit 1 + +#data preparation +#generate text, wav.scp, utt2spk, spk2utt +local/thchs-30_data_prep.sh $H $thchs/data_thchs30 || exit 1; + +#produce MFCC features +rm -rf data/mfcc && mkdir -p data/mfcc && cp -R data/{train,dev,test,test_phone} data/mfcc || exit 1; +for x in train dev test; do + #make mfcc + steps/make_mfcc.sh --nj $n --cmd "$train_cmd" data/mfcc/$x exp/make_mfcc/$x mfcc/$x || exit 1; + #compute cmvn + steps/compute_cmvn_stats.sh data/mfcc/$x exp/mfcc_cmvn/$x mfcc/$x || exit 1; +done +#copy feats and cmvn to test_phone, avoid duplicated mfcc & cmvn +cp data/mfcc/test/feats.scp data/mfcc/test_phone && cp data/mfcc/test/cmvn.scp data/mfcc/test_phone || exit 1; + + +#prepare language stuff +#build a large lexicon that involves words in both the training and decoding. +( + echo "make word graph ..." + cd $H; mkdir -p data/{dict,lang,graph} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict && \ + cat $thchs/resource/dict/lexicon.txt $thchs/data_thchs30/lm_word/lexicon.txt | \ + grep -v '<s>' | grep -v '</s>' | sort -u > data/dict/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict "<SPOKEN_NOISE>" data/local/lang data/lang || exit 1; + gzip -c $thchs/data_thchs30/lm_word/word.3gram.lm > data/graph/word.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang data/graph/word.3gram.lm.gz $thchs/data_thchs30/lm_word/lexicon.txt data/graph/lang || exit 1; +) + +#make_phone_graph +( + echo "make phone graph ..."
+ cd $H; mkdir -p data/{dict_phone,graph_phone,lang_phone} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict_phone && \ + cat $thchs/data_thchs30/lm_phone/lexicon.txt | grep -v '<eps>' | sort -u > data/dict_phone/lexicon.txt && \ + echo "<SIL> sil " >> data/dict_phone/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict_phone "<SIL>" data/local/lang_phone data/lang_phone || exit 1; + gzip -c $thchs/data_thchs30/lm_phone/phone.3gram.lm > data/graph_phone/phone.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang_phone data/graph_phone/phone.3gram.lm.gz $thchs/data_thchs30/lm_phone/lexicon.txt \ + data/graph_phone/lang || exit 1; +) + +#monophone +steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono || exit 1; +#test monophone model +local/thchs-30_decode.sh --mono true --nj $n "steps/decode.sh" exp/mono data/mfcc & + +#monophone_ali +steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1; + +#triphone +steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1; +#test tri1 model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc & + +#triphone_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +#lda_mllt +steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b || exit 1; +#test tri2b model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc & + + +#lda_mllt_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + +#sat +steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b || exit 1; +#test tri3b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc & + +#sat_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali || exit 1; + +#quick +steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b || exit 1; +#test tri4b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc & + +#quick_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri4b exp/tri4b_ali || exit 1; + +#quick_ali_cv +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/dev data/lang exp/tri4b exp/tri4b_ali_cv || exit 1; + +#train dnn model +local/nnet/run_dnn.sh --stage 0 --nj $n exp/tri4b exp/tri4b_ali exp/tri4b_ali_cv || exit 1; + +#train dae model +#python2.6 or above is required for noisy data generation. +#To speed up the process, pyximport for python is recommended.
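Editor's aside (not part of the patch): the local/thchs-30_decode.sh calls above are all launched in the background with '&', so before the final DAE stage below it can be handy to wait for them and summarize the word error rates. A minimal sketch, assuming utils/best_wer.sh from the wsj utils directory that this recipe symlinks:

wait   # let the background decode jobs finish
for d in exp/*/decode_test_word*; do
  [ -d $d ] && grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
done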
+local/dae/run_dae.sh --stage 0 $thchs || exit 1; diff --git a/egs/thchs30/s5/steps b/egs/thchs30/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/thchs30/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/thchs30/s5/utils b/egs/thchs30/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/thchs30/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/tidigits/s5/cmd.sh b/egs/tidigits/s5/cmd.sh index c8f0d9d67a7..71dd849a93b 100644 --- a/egs/tidigits/s5/cmd.sh +++ b/egs/tidigits/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -#export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/tidigits/s5/local/tidigits_prepare_lang.sh b/egs/tidigits/s5/local/tidigits_prepare_lang.sh index ff316514fc9..0bc08ab40a0 100755 --- a/egs/tidigits/s5/local/tidigits_prepare_lang.sh +++ b/egs/tidigits/s5/local/tidigits_prepare_lang.sh @@ -88,10 +88,11 @@ utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \ cp $lang/L.fst $lang/L_disambig.fst -silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'` -nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo +num_sil_states=5 +num_nonsil_states=3 +silphonelist=`cat $lang/phones/silence.csl` +nonsilphonelist=`cat $lang/phones/nonsilence.csl` +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo # Now we prepare a simple grammar G.fst that's a kind of loop of # digits (no silence in this, since that's handled in L.fst) diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index fd91a53ff73..5abbfd4495a 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -1,36 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -#export cuda_cmd=run.pl +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still sometimes used in nnet1 +# example scripts. +export cuda_cmd="queue.pl --gpu 1" - -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then - export train_cmd="queue.pl -l arch=*64*" - export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" - export cuda_cmd="queue.pl -l gpu=1" -elif [[ $(hostname -f) == *.fit.vutbr.cz ]]; then +# the rest of this file is present for historical reasons. +# for cluster-specific configuration it's better to rely on conf/queue.conf. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options - queue="all.q@@blade,all.q@@speech,all.q@dellgpu*,all.q@supergpu*" - export train_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,matylda5=0.5" - export decode_cmd="queue.pl -q $queue -l ram_free=3000M,mem_free=3000M,matylda5=0.1" - export mkgraph_cmd="queue.pl -q $queue -l ram_free=4G,mem_free=4G,matylda5=3" - export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu1,long.q@pcgpu*,long.q@supergpu1 -l gpu=1" -else - echo "$0: you need to define options for your cluster." - exit 1; + queue="all.q@@blade,all.q@@speech" + gpu_queue="long.q@@gpu" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/voxforge/s5/cmd.sh b/egs/voxforge/s5/cmd.sh index 2d454050669..71dd849a93b 100644 --- a/egs/voxforge/s5/cmd.sh +++ b/egs/voxforge/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/vystadial_cz/s5/cmd.sh b/egs/vystadial_cz/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_cz/s5/cmd.sh +++ b/egs/vystadial_cz/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_en/s5/cmd.sh b/egs/vystadial_en/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_en/s5/cmd.sh +++ b/egs/vystadial_en/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index 00aa0c145a3..537c46ba4f2 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -1,30 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - -#b) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +export mkgraph_cmd="queue.pl --mem 4G" +# the use of cuda_cmd is deprecated. +export cuda_cmd="queue.pl --gpu 1" +# the rest of this file is present for historical reasons. +# It's better to use conf/queue.conf for cluster-specific configuration. 
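Editor's aside: the cmd.sh files in this patch repeatedly point the reader at conf/queue.conf without showing one. A rough sketch of the kind of file meant here, modeled on the default_config string inside utils/queue.pl; the specific qsub flags and the g.q GPU queue are assumptions and will differ from cluster to cluster:

# write a minimal GridEngine-style queue config; each "option" line maps a
# generic option like --mem or --gpu onto scheduler-specific qsub flags
cat > conf/queue.conf <<'EOF'
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0
option num_threads=* -pe smp $0
option num_threads=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
EOF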
#c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi - diff --git a/egs/wsj/s5/local/nnet3/run_lstm.sh b/egs/wsj/s5/local/nnet3/run_lstm.sh index cd64b654651..2d7ab51d900 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm.sh @@ -46,7 +46,7 @@ frames_per_chunk= echo "$0 $@" # Print the command line for logging -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/wsj/s5/local/run_kl_hmm.sh b/egs/wsj/s5/local/run_kl_hmm.sh index 9e7679a7675..efe95052c1d 100644 --- a/egs/wsj/s5/local/run_kl_hmm.sh +++ b/egs/wsj/s5/local/run_kl_hmm.sh @@ -5,6 +5,8 @@ . cmd.sh +big_memory_cmd="$decode_cmd --mem 8G" + states=20000 dir=exp/tri4b_pretrain-dbn_dnn/ diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index 911640f5495..3f620083e11 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -228,7 +228,8 @@ sub SplitLongSegment { $aligned_ctm->[$seg_end_index]->[2] - $aligned_ctm->[$seg_start_index]->[1]; my $current_seg_index = $seg_start_index; - while ($current_seg_length > 1.5 * $max_seg_length) { + my $aligned_ctm_size = keys($aligned_ctm); + while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, $max_seg_length); my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length, @@ -322,7 +323,7 @@ sub ProcessWav { } # Save the aligned CTM if needed - if(tell($ACT) != -1){ + if(defined($ACT)){ for (my $i=0; $i<=$#aligned_ctm; $i++) { print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][1] "; print $ACT "$aligned_ctm[$i][2] $aligned_ctm[$i][3]\n"; @@ -458,4 +459,4 @@ sub InsertSilence { close(AI); close($SO); close($TO); -close($ACT); +close($ACT) if defined($ACT); diff --git a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh index c768d89b44e..cdf1ff3e5df 100755 --- a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh +++ b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh @@ -4,11 +4,12 @@ # this script gets some stats that will help you debug the lexicon. -# Begin configuration section. +# Begin configuration section. stage=1 remove_stress=false nj=10 # number of jobs for various decoding-type things that we run. cmd=run.pl +alidir= # End configuration section echo "$0 $@" # Print the command line for logging @@ -26,6 +27,8 @@ if [ $# != 5 ]; then echo " --remove-stress # if true, remove stress before printing analysis" echo " # note: if you change this, you only have to rerun" echo " # from stage 10." + echo " --alidir # if supplied, training-data alignments and transforms" + echo " # are obtained from here instead of being generated." exit 1; fi @@ -41,38 +44,46 @@ for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1; done -if [ $stage -le 1 ]; then - steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src ${src}_ali_$(basename $data) +if [ -z $alidir ]; then + alidir=${src}_ali_$(basename $data) + if [ $stage -le 1 ]; then + steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir + fi fi +phone_lang=data/$(basename $lang)_phone_bg + if [ $stage -le 2 ]; then - utils/make_phone_bigram_lang.sh $lang ${src}_ali_$(basename $data) data/$(basename $lang)_phone_bg + utils/make_phone_bigram_lang.sh $lang $alidir $phone_lang fi if [ $stage -le 3 ]; then - utils/mkgraph.sh data/$(basename $lang)_phone_bg $src $src/graph_phone_bg + utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg fi if [ $stage -le 4 ]; then - steps/decode_si.sh --cmd "$cmd" --nj $nj --transform-dir ${src}_ali_$(basename $data) \ - --acwt 0.25 --beam 25.0 --lattice-beam 5.0 --max-active 2500 \ + steps/decode_si.sh --skip-scoring true \ + --cmd "$cmd" --nj $nj --transform-dir $alidir \ + --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \ $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 5 ]; then - steps/get_train_ctm.sh $data $lang ${src}_ali_$(basename $data) + steps/get_train_ctm.sh --print-silence true --use-segments false \ + --cmd "$cmd" $data $lang $alidir fi if [ $stage -le 6 ]; then - steps/get_ctm.sh --min-lmwt 3 --max-lmwt 8 \ - $data data/$(basename $lang)_phone_bg $src/decode_$(basename $data)_phone_bg + steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \ + $data $phone_lang $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 7 ]; then mkdir -p $dir # lmwt=4 corresponds to the scale we decoded at. cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm - cp ${src}_ali_$(basename $data)/ctm $dir/word.ctm + + cp $alidir/ctm $dir/word.ctm fi if [ $stage -le 8 ]; then @@ -82,7 +93,7 @@ if [ $stage -le 8 ]; then # we'll convert it into two entries like this, with the start and end separately: # sw02054-A 0021332 START and # sw02054-A 0021356 END and -# +# # and suppose phone.ctm has lines like # sw02054 A 213.09 0.24 sil # sw02054 A 213.33 0.13 ae_B @@ -95,18 +106,17 @@ if [ $stage -le 8 ]; then # then after sorting and merge-sorting the two ctm files we can easily # work out for each word, what the phones were during that time. 
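Editor's aside: the merge-sort trick described in the comments above is easiest to see on a toy example. The sketch below uses a hypothetical utterance name and made-up times, with awk commands that mirror the ones the script itself runs:

printf 'sw02054 A 213.30 0.30 and\n' > word.ctm
printf 'sw02054 A 213.30 0.15 ae_B\nsw02054 A 213.45 0.15 n_E\n' > phone.ctm
# words become START/END events; each phone becomes a PHONE event at its midpoint
awk '{printf("%s-%s %09d START %s\n",$1,$2,100*$3,$5); printf("%s-%s %09d END %s\n",$1,$2,100*($3+$4),$5);}' word.ctm > word_processed.ctm
awk '{printf("%s-%s %09d PHONE %s\n",$1,$2,100*($3+0.5*$4),$5);}' phone.ctm > phone_processed.ctm
LC_ALL=C sort word_processed.ctm phone_processed.ctm
# every PHONE line now sorts between the START and END lines of the word it belongs to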
- grep -v '' data/$(basename $lang)_phone_bg/phones.txt | awk '{print $1, $1}' | \ + grep -v '' $phone_lang/phones.txt | awk '{print $1, $1}' | \ sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt - silphone=$(cat data/$(basename $lang)_phone_bg/phones/optional_silence.txt) - cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt | grep -v "$silphone\$" > $dir/phone_cleaned.ctm + cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_text.ctm > $dir/phone_mapped.ctm export LC_ALL=C - + cat $dir/word.ctm | awk '{printf("%s-%s %09d START %s\n", $1, $2, 100*$3, $5); printf("%s-%s %09d END %s\n", $1, $2, 100*($3+$4), $5);}' | \ sort >$dir/word_processed.ctm - cat $dir/phone_cleaned.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ + cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ sort >$dir/phone_processed.ctm # merge-sort both ctm's @@ -129,12 +139,16 @@ if [ $stage -le 10 ]; then else cp $srcdict $dir/lexicon.txt fi + silphone=$(cat $phone_lang/phones/optional_silence.txt) + echo " $silphone" >> $dir/lexicon.txt awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \ <$dir/prons.txt >$dir/counts.txt + + cat $dir/prons.txt | \ - if $remove_stress; then + if $remove_stress; then perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' else cat @@ -143,9 +157,9 @@ if [ $stage -le 10 ]; then open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]"; # create a hash of all reference pronuncations, and for each word, record # a list of the prons, separated by " | ". - while () { - @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; - $w = shift @A; + while () { + @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; + $w = shift @A; if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); } else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); } } diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh index 97fb62a9c4f..80a71b0edc5 100755 --- a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh @@ -5,9 +5,9 @@ # Computes training alignments using a model with delta or # LDA+MLLT features. This version, rather than just using the # text to align, computes mini-language models (unigram) from the text -# and a few common words in the LM, and allows +# and a few common words in the LM. -# Begin configuration section. +# Begin configuration section. nj=4 cmd=run.pl use_graphs=false @@ -82,7 +82,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $srcdir/full.mat $dir + cp $srcdir/final.mat $srcdir/full.mat $dir ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac @@ -155,7 +155,7 @@ if [ $stage -le 2 ]; then # # with the fields separated by tabs, e.g. 
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED - + paste $dir/edits.txt \ <(awk '{print $2}' $dir/length.txt) \ <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ @@ -171,9 +171,9 @@ fi if [ $stage -le 3 ]; then ### - # These stats migh help people figure out what is wrong with the data + # These stats might help people figure out what is wrong with the data # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt - # b)evaluation of per-speaker performance to possibly find speakers with + # b)evaluation of per-speaker performance to possibly find speakers with # distinctive accents/speech disorders and similar # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure # out if there is systematic issue with lexicon, pronunciation or phonetic confusability diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh new file mode 100755 index 00000000000..42c768f9a2d --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2016 Ilya Platonov +# Apache 2.0 +# +# Tweaked version of find_bad_utts.sh to work with nnet2 baseline models. +# +# Begin configuration section. +nj=32 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=15.0 +lattice_beam=8.0 +max_active=750 +transform_dir= # directory to find fMLLR transforms in. +top_n_words=100 # Number of common words that we compile into each graph (most frequent + # in $lang/text. +stage=-1 +cleanup=true +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: $0 " + echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_debug" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \ + $lang/L_disambig.fst $lang/phones/disambig.int; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. + +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; + + +if [ $stage -le 0 ]; then + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \ + awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \ + sort -rn > $dir/word_counts.int || exit 1; + num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1; + # print top-n words with their unigram probabilities. 
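Editor's aside: a tiny self-contained check of the count-to-probability step performed just below, using toy counts and hypothetical word-ids (not part of the patch):

printf '10 5\n6 7\n4 9\n' > word_counts.int          # "<count> <word-id>", sorted by count
tot=$(awk '{x+=$1} END{print x}' word_counts.int)    # total number of word tokens, here 20
head -n 2 word_counts.int | awk -v tot=$tot '{print $1/tot, $2;}'
# prints "0.5 5" and "0.3 7": unigram probability followed by word-id, as in top_words.int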
+ + head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int + utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt +fi + +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"; + +if [ $stage -le 1 ]; then + echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir" + + rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null + + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \ + steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \ + compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \ + $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + nnet-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \ + --max-active=$max_active --lattice-beam=$lattice_beam \ + --word-symbol-table=$lang/words.txt \ + $dir/final.mdl ark:- "$feats" ark:- \| \ + lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \ + ark,t:- ark,t:$dir/edits.JOB.txt \| \ + utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1; +fi + + +if [ $stage -le 2 ]; then + if [ -f $dir/edits.1.txt ]; then + # the awk commands below are to ensure that partially-written files don't confuse us. + for x in $(seq $nj); do cat $dir/edits.$x.txt; done | awk '{if(NF==2){print;}}' > $dir/edits.txt + for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/aligned_ref.txt + else + echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present." + fi + + # in case any utterances failed to align, get filtered copy of $data/text + utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text + cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt + + n1=$(wc -l < $dir/edits.txt) + n2=$(wc -l < $dir/aligned_ref.txt) + n3=$(wc -l < $dir/text) + n4=$(wc -l < $dir/length.txt) + if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then + echo "$0: mismatch in lengths of files:" + wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt + exit 1; + fi + + # note: the format of all_info.txt is: + # + # with the fields separated by tabs, e.g. 
+ # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED + + paste $dir/edits.txt \ + <(awk '{print $2}' $dir/length.txt) \ + <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ + <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt + + sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt + + if $cleanup; then + rm $dir/edits.*.txt $dir/aligned_ref.*.txt + fi + +fi + +if [ $stage -le 3 ]; then + ### + # These stats migh help people figure out what is wrong with the data + # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt + # b)evaluation of per-speaker performance to possibly find speakers with + # distinctive accents/speech disorders and similar + # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure + # out if there is systematic issue with lexicon, pronunciation or phonetic confusability + + mkdir -p $dir/analysis + align-text --special-symbol="***" ark:$dir/text ark:$dir/aligned_ref.txt ark,t:- | \ + utils/scoring/wer_per_utt_details.pl --special-symbol "***" > $dir/analysis/per_utt_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_ops_details.pl --special-symbol "***" | \ + sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt + +fi + diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh index 33be80d85b2..c1a22e274b8 100755 --- a/egs/wsj/s5/steps/conf/apply_calibration.sh +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -76,7 +76,7 @@ fi # Create the forwarding data for logistic regression, if [ $stage -le 2 ]; then steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ - $dir/ctm_int $word_feats $latdepth $word_categories + --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories fi # Apply calibration model to dev, diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index 003d77c5e8a..23db9633a1c 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -13,8 +13,8 @@ The logisitc-regression input features are: - posteriors from 'ctm' transformed by logit, - logarithm of word-length in letters, -- logarithm of average lattice-depth at position of the word, - 10base logarithm of unigram probability of a word from language model, +- logarithm of average lattice-depth at position of the word (optional), The logistic-regresion targets are: - 1 for correct word, @@ -33,12 +33,13 @@ parser = OptionParser(usage=usage, description=desc) parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') +parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). 
[default %default]", default='') (o, args) = parser.parse_args() -if len(args) != 4: +if len(args) != 3: parser.print_help() sys.exit(1) -ctm_file, word_feats_file, depths_file, word_categories_file = args +ctm_file, word_feats_file, word_categories_file = args assert(o.conf_feats != '') @@ -76,10 +77,12 @@ # Load the per-frame lattice-depth, # - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file, -depths = dict() -for l in open(depths_file): - utt,d = l.split(' ',1) - depths[utt] = map(int,d.split()) +# - if the 'ctm' and 'ark' keys don't match, we leave this feature out, +if o.lattice_depth: + depths = dict() + for l in open(o.lattice_depth): + utt,d = l.split(' ',1) + depths[utt] = map(int,d.split()) # Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt', wrd_to_cat = [ l.split() for l in open(word_categories_file) ] @@ -98,15 +101,19 @@ logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper) # - log of word-length, log_word_length = math.log(word_length[wrd_id]) # i.e. number of phones in a word, - # - log of average-depth of lattice at the word position, - depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] - log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) # - categorical distribution of words (with frequency higher than min-count), wrd_1_of_k = [0]*wrd_cat_num; wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; # Compose the input feature vector, - feats = [ logit, log_word_length, log_avg_depth, other_feats[wrd_id] ] + wrd_1_of_k + feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k + + # Optionally add average-depth of lattice at the word position, + if o.lattice_depth != '': + depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) + feats += [ log_avg_depth ] + # Store the input features, f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh index 64ca70022c8..c2aca05056e 100755 --- a/egs/wsj/s5/steps/conf/train_calibration.sh +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -104,7 +104,7 @@ fi if [ $stage -le 3 ]; then steps/conf/prepare_calibration_data.py \ --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ - $dir/ctm_aligned_int $word_feats $latdepth $dir/word_categories + --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories fi # Train the logistic regression, diff --git a/egs/wsj/s5/steps/decode.sh b/egs/wsj/s5/steps/decode.sh index b0e2fed2017..f2bc1d367fd 100755 --- a/egs/wsj/s5/steps/decode.sh +++ b/egs/wsj/s5/steps/decode.sh @@ -3,8 +3,8 @@ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 -# Begin configuration section. -transform_dir= # this option won't normally be used, but it can be used if you want to +# Begin configuration section. +transform_dir= # this option won't normally be used, but it can be used if you want to # supply existing fMLLR transforms when decoding. iter= model= # You can specify the model to use (e.g. if you want to use the .alimdl) @@ -64,16 +64,16 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs if [ -z "$model" ]; then # if --model was not specified on the command line... 
- if [ -z $iter ]; then model=$srcdir/final.mdl; + if [ -z $iter ]; then model=$srcdir/final.mdl; else model=$srcdir/$iter.mdl; fi fi if [ $(basename $model) != final.alimdl ] ; then # Do not use the $srcpath -- look at the path where the model is - if [ -f $(dirname $model)/final.alimdl ] ; then - echo -e '\n\n' - echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' - echo $0 'WARNING: This is OK if you know what you are doing...' + if [ -f $(dirname $model)/final.alimdl ] && [ -z "$transform_dir" ]; then + echo -e '\n\n' + echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' + echo $0 'WARNING: This is OK if you know what you are doing...' echo -e '\n\n' fi fi @@ -90,7 +90,7 @@ cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; @@ -129,7 +129,7 @@ fi if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } fi diff --git a/egs/wsj/s5/steps/get_ctm.sh b/egs/wsj/s5/steps/get_ctm.sh index 3d0ea576a57..2f2f6794e3d 100755 --- a/egs/wsj/s5/steps/get_ctm.sh +++ b/egs/wsj/s5/steps/get_ctm.sh @@ -8,6 +8,7 @@ # begin configuration section. cmd=run.pl stage=0 +frame_shift=0.01 min_lmwt=5 max_lmwt=20 use_segments=true # if we have a segments file, use it to convert @@ -28,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your lattices have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri4a/decode/" echo "See also: steps/get_train_ctm.sh" @@ -55,7 +58,7 @@ if [ $stage -le 0 ]; then [ ! 
-f $f ] && echo "$0: expecting file $f to exist" && exit 1; filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" else - filter_cmd=cat + filter_cmd=cat fi if [ -f $lang/phones/word_boundary.int ]; then @@ -63,7 +66,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; else @@ -76,7 +79,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; fi diff --git a/egs/wsj/s5/steps/get_train_ctm.sh b/egs/wsj/s5/steps/get_train_ctm.sh index a6cbb2ac06a..10b29708d84 100755 --- a/egs/wsj/s5/steps/get_train_ctm.sh +++ b/egs/wsj/s5/steps/get_train_ctm.sh @@ -7,9 +7,12 @@ # begin configuration section. cmd=run.pl +frame_shift=0.01 stage=0 use_segments=true # if we have a segments file, use it to convert # the segments to be relative to the original files. +print_silence=false # if true, will print (optional-silence) arcs. + #end configuration section. echo "$0 $@" # Print the command line for logging @@ -26,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your alignments have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri3a_ali" echo "Produces ctm in: exp/tri3a_ali/ctm" @@ -58,9 +63,9 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 else if [ ! -f $lang/phones/align_lexicon.int ]; then echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align." @@ -71,14 +76,14 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 fi fi if [ $stage -le 1 ]; then - if [ -f $data/segments ]; then + if [ -f $data/segments ] && $use_segments; then f=$data/reco2file_and_channel [ ! 
-f $f ] && echo "$0: expecting file $f to exist" && exit 1; for n in `seq $nj`; do gunzip -c $dir/ctm.$n.gz; done | \ diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 425fbc39f99..092bc53f5e8 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -9,6 +9,7 @@ cmd=run.pl skip_scoring=false stage=1 +scoring_opts= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -57,7 +58,7 @@ fi if ! $skip_scoring && [ $stage -le 2 ]; then err_msg="Not scoring because local/score.sh does not exist or not executable." [ ! -x local/score.sh ] && echo $err_msg && exit 1; - local/score.sh --cmd "$cmd" $data $newlang $outdir + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir else echo "Not scoring because requested so..." fi diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh index 1d152f6cf8d..09c34d40b24 100755 --- a/egs/wsj/s5/steps/make_mfcc.sh +++ b/egs/wsj/s5/steps/make_mfcc.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 @@ -81,7 +81,7 @@ if [ -f $data/segments ]; then for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done - + utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null @@ -127,8 +127,8 @@ done > $data/feats.scp rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/nnet/train.sh b/egs/wsj/s5/steps/nnet/train.sh index 8aceffccdaa..656acf2a815 100755 --- a/egs/wsj/s5/steps/nnet/train.sh +++ b/egs/wsj/s5/steps/nnet/train.sh @@ -145,14 +145,20 @@ else labels_tr_phn="ark:ali-to-phones --per-frame=true $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" # get pdf-counts, used later for decoding/aligning, - analyze-counts --verbose=1 --binary=false "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log || exit 1 + analyze-counts --verbose=1 --binary=false \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log || exit 1 # copy the old transition model, will be needed by decoder, copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl || exit 1 # copy the tree cp $alidir/tree $dir/tree || exit 1 # make phone counts for analysis, - [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log || exit 1 + [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log || exit 1 fi ###### PREPARE FEATURES ###### @@ -365,7 +371,7 @@ if [ ! -z $nnet_init ]; then elif [ ! 
-z $nnet_proto ]; then echo "# initializing NN from prototype '$nnet_proto'"; nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log - nnet-initialize --seed $seed $nnet_proto $nnet_init + nnet-initialize --seed=$seed $nnet_proto $nnet_init else echo "# getting input/output dims :" # input-dim, @@ -424,7 +430,7 @@ else # initialize, nnet_init=$dir/nnet.init echo "# initializing the NN '$nnet_proto' -> '$nnet_init'" - nnet-initialize $nnet_proto $nnet_init + nnet-initialize --seed=$seed $nnet_proto $nnet_init # optionally prepend dbn to the initialization, if [ ! -z "$dbn" ]; then diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh deleted file mode 100755 index a960e2fcfe9..00000000000 --- a/egs/wsj/s5/steps/nnet2/get_num_frames.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# This script works out the approximate number of frames in a training directory -# this is sometimes needed by higher-level scripts - -num_samples=1000 - - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -ne 1 ]; then - ( - echo "Usage: $0 " - echo "Prints the number of frames of data in the data-dir, via sampling rather" - echo "than trying to access all the data." - ) 1>&2 -fi - -data=$1 - -if [ ! -f $data/feats.scp ]; then - if [ -f $data/segments ]; then - echo "$0: $data/feats.scp does not exist, but $data/segments does exist; using that and assuming 100 frames per second." 1>&2 - num_frames=$(cat $data/segments | awk '{x += $4 - $3;} END{print int(x*100);}') || exit 1; - echo $num_frames - exit 0; - else - echo "$0: neither $data/feats.scp nor $data/segments exist." 1>&2 - exit 1; - fi -fi - - -sample_frames=$(utils/shuffle_list.pl $data/feats.scp | head -n $num_samples | sort | feat-to-len --print-args=false scp:-) - -num_files_orig=$(wc -l <$data/feats.scp) -if [ $num_samples -lt $num_files_orig ]; then - num_files_sampled=$num_samples -else - num_files_sampled=$num_files_orig -fi - -perl -e "\$n = int(($sample_frames * 1.0 * $num_files_orig) / (1.0 * $num_files_sampled)); print \"\$n\n\";"; diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh new file mode 120000 index 00000000000..d5eab6ede07 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/get_num_frames.sh @@ -0,0 +1 @@ +../../utils/data/get_num_frames.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index fc75932d0d3..35a5bac5313 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -19,13 +19,19 @@ # Begin configuration section. cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. -frames_per_eg=25 # number of frames of labels per example. more->less disk space and - # less time preparing egs, but more I/O during training. - # note: the script may reduce this if reduce_frames_per_eg is true. 
+frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. note: the script may reduce this if + # reduce_frames_per_eg is true. frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. -frame_subsampling_factor=3 # ratio between input and output frame-rate of nnet. +cut_zero_frames=-1 # if activated, activates new-style derivative weights.. i'll reorganize + # this if it works well. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +alignment_subsampling_factor=3 # frames-per-second of input alignments divided + # by frames-per-second at output of chain model left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. @@ -41,10 +47,13 @@ num_utts_subset=300 # number of utterances in validation and training num_valid_egs_combine=0 # #validation examples for combination weights at the very end. num_train_egs_combine=1000 # number of train examples for the above. num_egs_diagnostic=400 # number of frames for "compute_prob" jobs -frames_per_iter=400000 # each iteration of training, see this many frames - # per job. This is just a guideline; it will pick a number +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. + right_tolerance= #CTC right tolerance == max label delay. +left_tolerance= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms @@ -263,7 +272,7 @@ if [ $stage -le 2 ]; then fi -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames" [ -z $valid_left_context ] && valid_left_context=$left_context; @@ -271,10 +280,12 @@ egs_opts="--left-context=$left_context --right-context=$right_context --num-fram # don't do the overlap thing for the validation data. valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" -ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$frame_subsampling_factor" +ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance" +[ ! 
-z $left_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context @@ -326,7 +337,7 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - # create egs_orig.*.*.ark; the first index goes to $nj, + # create cegs_orig.*.*.ark; the first index goes to $nj, # the second to $num_archives_intermediate. egs_list= @@ -379,7 +390,7 @@ if [ $stage -le 5 ]; then for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark - ln -sf egs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 done done $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ @@ -394,6 +405,9 @@ if [ $stage -le 6 ]; then ( cd $dir for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null ) if [ $archives_multiple -gt 1 ]; then # there are some extra soft links that we should delete. diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py new file mode 100644 index 00000000000..87961a0a8a6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -0,0 +1,245 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +import subprocess +import logging +import math +import re +import time +import imp +import os + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def GetNumberOfLeaves(dir): + [stdout, stderr] = train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + parts = stdout.split() + #number of pdfs 7115 + assert(' '.join(parts[0:3]) == "number of pdfs") + num_leaves = int(parts[3]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + +def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): + train_lib.RunKaldiCommand(""" + {command} {dir}/log/make_phone_lm.log \ + chain-est-phone-lm {lm_opts} \ + "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + {dir}/phone_lm.fst + """.format(command = run_opts.command, + dir = dir, + lm_opts = lm_opts if lm_opts is not None else '', + tree_dir = tree_dir)) + +def CreateDenominatorFst(dir, tree_dir, run_opts): + train_lib.RunKaldiCommand(""" + copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl + {command} {dir}/log/make_den_fst.log \ + chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ + {dir}/den.fst {dir}/normalization.fst""".format( + tree_dir = tree_dir, dir = dir, command = run_opts.command)) + +def GenerateChainEgs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage = 0, + valid_left_context = None, valid_right_context = None, + left_tolerance = None, right_tolerance = None, + frame_subsampling_factor = 3, + alignment_subsampling_factor = 3, + feat_type = 'raw', 
online_ivector_dir = None, + frames_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + train_lib.RunKaldiCommand(""" +steps/nnet3/chain/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context '{valid_left_context}' \ + --valid-right-context '{valid_right_context}' \ + --left-tolerance '{left_tolerance}' \ + --right-tolerance '{right_tolerance}' \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --alignment-subsampling-factor {alignment_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {dir} {lat_dir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context if valid_left_context is not None else '', + valid_right_context = valid_right_context if valid_right_context is not None else '', + left_tolerance = left_tolerance if left_tolerance is not None else '', + right_tolerance = right_tolerance if right_tolerance is not None else '', + frame_subsampling_factor = frame_subsampling_factor, + alignment_subsampling_factor = alignment_subsampling_factor, + stage = stage, frames_per_iter = frames_per_iter, + frames_per_eg = frames_per_eg, + data = data, lat_dir = lat_dir, dir = dir, egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +# this function is exactly similar to the version in nnet3_train_lib.py +# except it uses egs files in place of cegs files +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + + # Write stats with the same format as stats for LDA. + train_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
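The comment above points at Appendix C.6 of the cited paper for the "scaled LDA without dimensionality reduction" idea. Purely as an illustration of that idea, and not of Kaldi's actual nnet-get-feature-transform estimator (in particular, the sqrt(lam / (lam + 1)) scaling below is just one plausible choice), a NumPy sketch of such a transform might look like:

import numpy as np

def lda_like_transform(feats, labels, num_classes):
    # Rough sketch only: an LDA-style affine transform that keeps every input
    # dimension and rescales it instead of truncating to the leading directions.
    # feats: (N, dim) float array; labels: (N,) integer array of class ids.
    dim = feats.shape[1]
    global_mean = feats.mean(axis=0)
    within = np.zeros((dim, dim))
    between = np.zeros((dim, dim))
    for c in range(num_classes):
        rows = feats[labels == c]
        if rows.shape[0] == 0:
            continue
        class_mean = rows.mean(axis=0)
        centered = rows - class_mean
        within += centered.T.dot(centered)
        md = (class_mean - global_mean)[:, None]
        between += rows.shape[0] * md.dot(md.T)
    within /= feats.shape[0]
    between /= feats.shape[0]
    # whiten with respect to the within-class covariance ...
    w_vals, w_vecs = np.linalg.eigh(within)
    w_vals = np.maximum(w_vals, 1e-10)
    whiten = w_vecs.dot(np.diag(w_vals ** -0.5)).dot(w_vecs.T)
    # ... then rotate to diagonalize the whitened between-class covariance
    b_vals, b_vecs = np.linalg.eigh(whiten.dot(between).dot(whiten))
    # keep all dimensions (no reduction); scale each one instead of discarding it
    scale = np.sqrt(np.maximum(b_vals, 0.0) / (np.maximum(b_vals, 0.0) + 1.0))
    transform = np.diag(scale).dot(b_vecs.T).dot(whiten)
    offset = -transform.dot(global_mean)   # makes it an affine transform
    return transform, offset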
+ + train_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +def PrepareInitialAcousticModel(dir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # The model-format for a 'chain' acoustic model is just the transition + # model and then the raw nnet, so we can use 'cat' to create this, as + # long as they have the same mode (binary or not binary). + # We ensure that they have the same mode (even if someone changed the + # script to make one or both of them text mode) by copying them both + # before concatenating them. + train_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( + command = run_opts.command, dir = dir)) + +def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts): + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + train_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} "ark:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + dir = dir, raw_models = " ".join(raw_model_strings), + num_chunk_per_minibatch = num_chunk_per_minibatch, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
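As a tiny standalone check of which models CombineModels above hands to nnet3-chain-combine (the counts here are hypothetical, not from any recipe):

# the loop above combines the last num_iters_combine models
num_iters = 800
num_iters_combine = 20
models = ["{0}.mdl".format(it)
          for it in range(num_iters - num_iters_combine + 1, num_iters + 1)]
print(models[0], models[-1], len(models))   # 781.mdl 800.mdl 20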
+ ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, iter = iter, model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + +def ComputeProgress(dir, iter, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + train_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-am-info {model} '&&' \ +nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model), wait = wait) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py new file mode 100755 index 00000000000..08746d523ee --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + +import os +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +import shutil +import math + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains RNN and DNN acoustic models using the 'chain' objective function. 
+ """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="directory with the ivectors extracted in an online fashion.") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 150, + help="Number of output labels in each example. Caution: if you double this you should halve --trainer.samples-per-iter.") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="Directory with egs. If specified this directory " + "will be used rather than extracting egs") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = -6, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.l2-regularize", type=float, dest='l2_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " l2-norm of the output of the network. It should be" + " used without the log-softmax layer for the outputs." 
+ " As l2-norm of the log-softmax outputs can dominate" + " the objective function.") + parser.add_argument("--chain.xent-regularize", type=float, dest='xent_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " cross-entropy cost the outputs.") + parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', + default = 5, help="") + parser.add_argument("--chain.left-tolerance", type=int, dest='left_tolerance', + default = 5, help="") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', + default = 0.00001, help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', + default=True, action=train_lib.StrToBoolAction, + choices = ["true", "false"], + help="") + parser.add_argument("--chain.truncate-deriv-weights", type=float, dest='truncate_deriv_weights', + default =0, + help="Can be used to set to zero the weights of derivs" + " from frames near the edges. (counts subsampled frames)") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', + default = 3, + help="ratio of frames-per-second of features we train" + " on, to chain model's output") + parser.add_argument("--chain.alignment-subsampling-factor", type=int, + dest='alignment_subsampling_factor', + default = 3, + help="ratio of frames-per-second of input alignments to" + " chain model's output") + parser.add_argument("--chain.ngram-order", type=int, dest='ngram_order', + default = 3, help="") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default = None, help="") + parser.add_argument("--chain.right-deriv-truncate", type=int, + dest='right_deriv_truncate', + default = None, help="") + + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 10, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final" + " model combination stage. These models will themselves" + " be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + " iteration. If 0 or a large value the randomization is" + " complete, but this will consume memory and cause spikes" + " in disk I/O. Smaller is easier on disk and memory but" + " less random. It's not a huge deal though, as samples" + " are anyway randomized right at the start. 
(the point" + " of this is to get data in different minibatches on" + " different iterations, since in the preconditioning" + " method, 2 samples in the same minibatch can affect" + " each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + " during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per" + " minibatch, measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', + default=800000, + help ="Each iteration of training, see this many [input]" + " frames per job. This option is passed to get_egs.sh." + " Aim for about a minute of training time") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="Value used in preconditioning matrix estimation") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="Max number of jobs used for LDA stats accumulation") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at" + " the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = "The is the maximum number of models we give to" + " the final 'combine' stage, but these models will" + " themselves be averages of iteration-number ranges.") + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="Momentum used in update computation." + " Note: we implemented it in such a way that it doesn't" + " increase the effective learning rate.") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 1.0, + help="Scaling factor used for scaling the parameter" + " matrices when the derivative averages are below the" + " shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this" + " threshold we scale the parameter matrices with the" + " shrink-value. It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, dest='shrink_nonlinearity', + default = "SigmoidComponent", choices = ["TanhComponent", "SigmoidComponent"], + help="The non-linear component from which the" + " deriv-avg values are going to used to compute" + " mean-deriv-avg. The mean-deriv-avg is going to be" + " compared with shrink-threshold. 
Be careful to specify" + " a shrink-threshold which is dependent on the" + " shrink-nonlinearity type") + + # RNN specific trainer options + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=512, + help="Number of sequences to be processed in parallel every minibatch" ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = train_lib.NullstrToNoneAction, dest="command", + help="Specifies the script to launch jobs." + " e.g. queue.pl for launching on SGE cluster run.pl" + " for launching on local machine", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="If true, remove egs after experiment") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = train_lib.NullstrToNoneAction, + help="Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local expertise to setup. ") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--tree-dir", type=str, required = True, + help="Languade directory") + parser.add_argument("--lat-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not train_lib.CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + + run_opts.command = args.command + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.parallel_train_opts = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. 
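A standalone illustration of the archive / frame-shift indexing computed in TrainNewModels above, using made-up sizes (2 jobs, 5 archives, frame_subsampling_factor of 3; the real script derives these from the egs directory):

# hypothetical sizes, for illustration only
num_jobs = 2
num_archives = 5
frame_subsampling_factor = 3
num_archives_processed = 7   # pretend we are some way into training

for job in range(1, num_jobs + 1):
    k = num_archives_processed + job - 1          # zero-based global index
    archive_index = (k % num_archives) + 1        # 1-based archive to read
    # same expression as in TrainNewModels above (integer division)
    frame_shift = (archive_index + k // num_archives) % frame_subsampling_factor
    print(job, archive_index, frame_shift)
# job 1: k=7 -> archive 3, frame_shift (3 + 1) % 3 = 1
# job 2: k=8 -> archive 4, frame_shift (4 + 1) % 3 = 2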
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts) + + [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
+ train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + CheckForRequiredFiles(args.feat_dir, args.tree_dir, args.lat_dir) + + # Set some variables. + num_jobs = train_lib.GetNumberOfJobs(args.tree_dir) + feat_dim = train_lib.GetFeatDim(args.feat_dir) + ivector_dim = train_lib.GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + train_lib.SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = train_lib.ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
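The series of "if (args.stage <= ...)" blocks that follows uses the usual Kaldi resume-from-stage convention; a compressed summary of the setup stages it walks through (descriptions paraphrased from the code below) is:

# summary of the setup stages in Train() below; stage >= 0 is the iteration
# loop.  Passing --stage N skips everything whose stage value is < N.
SETUP_STAGES = [
    (-6, "create the phone language model (chain-est-phone-lm)"),
    (-5, "create the denominator FST (chain-make-den-fst)"),
    (-4, "initialize init.raw from configs/init.config"),
    (-3, "generate egs with steps/nnet3/chain/get_egs.sh (unless --egs.dir is given)"),
    (-2, "compute the LDA-like preconditioning matrix"),
    (-1, "prepare the initial acoustic model (0.mdl)"),
]

def run_setup(requested_stage):
    # minimal sketch of the gating pattern used in Train()
    for stage, description in SETUP_STAGES:
        if requested_stage <= stage:
            print("would run:", description)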
+ if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.CreatePhoneLm(args.dir, args.tree_dir, run_opts, lm_opts = args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + chain_lib.CreateDenominatorFst(args.dir, args.tree_dir, run_opts) + + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + train_lib.RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + # this is where get_egs.sh is called. + chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, + left_context + args.frame_subsampling_factor/2, + right_context + args.frame_subsampling_factor/2, + run_opts, + left_tolerance = args.left_tolerance, + right_tolerance = args.right_tolerance, + frame_subsampling_factor = args.frame_subsampling_factor, + alignment_subsampling_factor = args.alignment_subsampling_factor, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + frames_per_iter = args.frames_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + chain_lib.PrepareInitialAcousticModel(args.dir, run_opts) + + file_handle = open("{0}/frame_subsampling_factor".format(args.dir),"w") + file_handle.write(str(args.frame_subsampling_factor)) + file_handle.close() + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
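As a numerical sanity check of the comment above and the formulas just below (all numbers here are made up, not taken from any recipe):

# hypothetical values, for illustration only
num_archives = 120
frame_subsampling_factor = 3
num_epochs = 10
num_jobs_initial, num_jobs_final = 1, 8

num_archives_expanded = num_archives * frame_subsampling_factor      # 360
num_archives_to_process = num_epochs * num_archives_expanded         # 3600
# same formula as below; the average number of jobs is (1 + 8) / 2 = 4.5
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)
print(num_iters)                                                     # 800

# the number of parallel jobs ramps linearly from num_jobs_initial to num_jobs_final
for it in (0, 400, 799):
    current_num_jobs = int(0.5 + num_jobs_initial +
                           (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
    print(it, current_num_jobs)   # 0 -> 1, 400 -> 5, 799 -> 8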
+ num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = train_lib.VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: train_lib.GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if args.shrink_value != 1.0: + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 + else: + shrinkage_value = args.shrink_value + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, + args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, + args.frame_subsampling_factor, + args.truncate_deriv_weights, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, key="log-probability") + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + train_lib.SendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, + args.num_chunk_per_minibatch, egs_dir, + args.leaky_hmm_coefficient, args.l2_regularize, + args.xent_regularize, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = 
nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") + if args.email is not None: + train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + train_lib.SendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 1a62d8d7bb6..d89e9a335dc 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -23,21 +23,33 @@ truncate_deriv_weights=0 # can be used to set to zero the weights of derivs fro apply_deriv_weights=true initial_effective_lrate=0.0002 final_effective_lrate=0.00002 +extra_left_context=0 # only relevant for recurrent setups. pnorm_input_dim=3000 pnorm_output_dim=300 relu_dim= # you can use this to make it use ReLU's instead of p-norms. + +jesus_opts= # opts to steps/nnet3/make_jesus_configs.py. + # If nonempty, assumes you want to use the jesus nonlinearity, + # and you should supply various options to that script in + # this string. rand_prune=4.0 # Relates to a speedup we do for LDA. minibatch_size=512 # This default is suitable for GPU-based training. # Set it to 128 for multi-threaded CPU-based training. lm_opts= # options to chain-est-phone-lm +l2_regularize=0.0 +leaky_hmm_coefficient=0.00001 +xent_regularize=0.0 frames_per_iter=800000 # each iteration of training, see this many [input] # frames per job. This option is passed to get_egs.sh. # Aim for about a minute of training time -right_tolerance=10 -denominator_scale=1.0 # relates to tombsone stuff. +right_tolerance=5 # tolerance at the same frame-rate as the alignment directory. +left_tolerance=5 # tolerance at the same frame-rate as the alignment directory. num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -frame_subsampling_factor=3 # controls reduced frame-rate at the output. +frame_subsampling_factor=3 # ratio of frames-per-second of features we train + # on, to chain model's output +alignment_subsampling_factor=3 # ratio of frames-per-second of input alignments + # to chain model's output get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= max_param_change=2.0 @@ -66,6 +78,10 @@ exit_stage=-100 # you can set this to terminate the training early. Exits befor # count space-separated fields in splice_indexes to get num-hidden-layers. splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count @@ -87,7 +103,7 @@ right_deriv_truncate= # number of time-steps to avoid using the deriv of, on th # End configuration section.
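frame_subsampling_factor and alignment_subsampling_factor above are both defined as ratios relative to the chain model's output frame rate. Assuming the common 10 ms feature frame shift, i.e. 100 feature frames per second (an assumption made here for illustration, not something the script fixes), the arithmetic works out as:

from fractions import Fraction

feature_fps = Fraction(100)     # assumes a 10 ms frame shift; the script only fixes the ratios
frame_subsampling_factor = 3    # feature frame rate / chain-model output frame rate
chain_output_fps = feature_fps / frame_subsampling_factor          # 100/3, ~33 outputs per second

alignment_fps = Fraction(100)   # e.g. alignments and lattices from a conventional GMM/HMM system
alignment_subsampling_factor = alignment_fps / chain_output_fps    # == 3
print(chain_output_fps, alignment_subsampling_factor)              # 100/3 3

# by the same definitions, if the lattices already came from a frame-subsampled
# (chain) system running at ~33 frames per second, alignment_subsampling_factor
# would be 1 while frame_subsampling_factor stayed at 3.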
-trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM echo "$0 $@" # Print the command line for logging @@ -197,23 +213,44 @@ num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exi if [ $stage -le -5 ]; then echo "$0: creating neural net configs"; - if [ ! -z "$relu_dim" ]; then - dim_opts="--relu-dim $relu_dim" + + if [ ! -z "$jesus_opts" ]; then + $cmd $dir/log/make_configs.log \ + python steps/nnet3/make_jesus_configs.py \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $jesus_opts \ + --num-targets $num_leaves \ + $dir/configs || exit 1; else - dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" - fi + [ $xent_regularize != "0.0" ] && \ + echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi - # create the config files for nnet initialization - python steps/nnet3/make_tdnn_configs.py \ - --include-log-softmax=false \ - --final-layer-normalize-target $final_layer_normalize_target \ - --splice-indexes "$splice_indexes" \ - --feat-dim $feat_dim \ - --ivector-dim $ivector_dim \ - $dim_opts \ - --num-targets $num_leaves \ - --use-presoftmax-prior-scale false \ - $dir/configs || exit 1; + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + --include-log-softmax=false \ + --final-layer-normalize-target $final_layer_normalize_target \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --num-targets $num_leaves \ + --use-presoftmax-prior-scale false \ + $dir/configs || exit 1; + fi # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; @@ -229,6 +266,12 @@ fi # num_hidden_layers=(something) . $dir/configs/vars || exit 1; +# the next 2 lines are in case the configs were created by an older +# config-generating script, which writes to left_context and right_context +# instead of model_left_context and model_right_context. +[ -z $model_left_context ] && model_left_context=$left_context +[ -z $model_right_context ] && model_right_context=$right_context + ! [ "$num_hidden_layers" -gt 0 ] && echo \ "$0: Expected num_hidden_layers to be defined" && exit 1; @@ -242,14 +285,17 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts+=(--transform-dir $transform_dir) # we need a bit of extra left-context and right-context to allow for frame # shifts (we use shifted version of the data for more variety). 
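The extra_opts lines just below widen the egs context to allow for the frame shifts mentioned in the comment above; with hypothetical model contexts the integer arithmetic looks like this:

# hypothetical model contexts, for illustration only
model_left_context = 16
model_right_context = 12
frame_subsampling_factor = 3
extra_left_context = 0            # nonzero only for recurrent setups

# same integer arithmetic as the extra_opts lines below
egs_left_context = model_left_context + frame_subsampling_factor // 2 + extra_left_context
egs_right_context = model_right_context + frame_subsampling_factor // 2
print(egs_left_context, egs_right_context)    # 17 13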
- extra_opts+=(--left-context $[$left_context+$frame_subsampling_factor/2]) - extra_opts+=(--right-context $[$right_context+$frame_subsampling_factor/2]) + extra_opts+=(--left-context $[$model_left_context+$frame_subsampling_factor/2+$extra_left_context]) + extra_opts+=(--right-context $[$model_right_context+$frame_subsampling_factor/2]) echo "$0: calling get_egs.sh" steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ --frames-per-iter $frames_per_iter --stage $get_egs_stage \ --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ --frames-per-eg $frames_per_eg \ --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $alignment_subsampling_factor \ $data $dir $latdir $dir/egs || exit 1; fi @@ -271,8 +317,8 @@ cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null # the --egs-dir option was used on the command line). egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 -( [ $egs_left_context -lt $left_context ] || \ - [ $egs_right_context -lt $right_context ] ) && \ +( [ $egs_left_context -lt $model_left_context ] || \ + [ $egs_right_context -lt $model_right_context ] ) && \ echo "$0: egs in $egs_dir have too little context" && exit -1; frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } @@ -414,11 +460,11 @@ while [ $x -lt $num_iters ]; do # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & @@ -440,10 +486,12 @@ while [ $x -lt $num_iters ]; do cur_num_hidden_layers=$[1+$x/$add_layers_period] config=$dir/configs/layer$cur_num_hidden_layers.config mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_io_opts="" else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_io_opts="--read-cache=$dir/cache.$x" fi if $do_average; then this_minibatch_size=$minibatch_size @@ -461,7 +509,9 @@ while [ $x -lt $num_iters ]; do rm $dir/.error 2>/dev/null - ( # this sub-shell is so that when we "wait" below, + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. @@ -473,10 +523,16 @@ while [ $x -lt $num_iters ]; do # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
frame_shift=$[($k/$num_archives)%$frame_subsampling_factor]; - + if [ $n -eq 1 ]; then + # opts for computation cache (storing compiled computation). + this_cache_io_opts="$cache_io_opts --write-cache=$dir/cache.$[$x+1]" + else + this_cache_io_opts="$cache_io_opts" + fi $cmd $train_queue_opt $dir/log/train.$x.$n.log \ nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \ - $parallel_train_opts $deriv_time_opts \ + --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + $this_cache_io_opts $parallel_train_opts $deriv_time_opts \ --max-param-change=$this_max_param_change \ --print-interval=10 "$mdl" $dir/den.fst \ "ark:nnet3-chain-copy-egs --truncate-deriv-weights=$truncate_deriv_weights --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ @@ -518,6 +574,7 @@ while [ $x -lt $num_iters ]; do rm $dir/$[$x-1].mdl fi fi + rm $dir/cache.$x 2>/dev/null x=$[$x+1] num_archives_processed=$[$num_archives_processed+$this_num_jobs] done @@ -543,7 +600,7 @@ if [ $stage -le $num_iters ]; then # num-threads to 8 to speed it up (this isn't ideal...) $cmd $combine_queue_opt $dir/log/combine.log \ - nnet3-chain-combine --num-iters=40 \ + nnet3-chain-combine --num-iters=40 --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1; @@ -553,11 +610,11 @@ if [ $stage -le $num_iters ]; then # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. 
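The --read-cache/--write-cache handling above follows a simple pattern: every job of iteration x reads cache.x (unless a layer was just added, in which case there is nothing to read), only job 1 writes cache.x+1, and cache.x is deleted once the iteration finishes. A standalone sketch of who gets which flags, with a made-up experiment directory and a helper name that is mine, not the script's:

# illustration of the computation-cache options used above; paths are hypothetical
exp_dir = "exp/chain/tdnn1"

def cache_io_opts_for(iteration, job, just_added_layer):
    opts = "" if just_added_layer else "--read-cache={0}/cache.{1}".format(exp_dir, iteration)
    if job == 1:
        # only the first job writes the cache used by the next iteration
        opts += " --write-cache={0}/cache.{1}".format(exp_dir, iteration + 1)
    return opts.strip()

for job in (1, 2, 3):
    print(job, cache_io_opts_for(iteration=5, job=job, just_added_layer=False))
# job 1: --read-cache=exp/chain/tdnn1/cache.5 --write-cache=exp/chain/tdnn1/cache.6
# jobs 2, 3: --read-cache=exp/chain/tdnn1/cache.5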
$cmd $dir/log/compute_prob_valid.final.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & fi diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 87323a1c3e1..e14ab40519f 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -6,6 +6,24 @@ import sys import warnings import copy +from operator import itemgetter + +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors # adds the input nodes and returns the descriptor def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): @@ -19,11 +37,24 @@ def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): components.append('input-node name=ivector dim=' + str(ivector_dim)) list.append('ReplaceIndex(ivector, t, 0)') output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] print(splice_descriptor) return {'descriptor': splice_descriptor, 'dimension': output_dim} +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + def AddLdaLayer(config_lines, name, input, lda_file): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -34,6 +65,27 @@ def AddLdaLayer(config_lines, name, input, lda_file): return {'descriptor': '{0}_lda'.format(name), 'dimension': input['dimension']} +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + +def 
AddPermuteLayer(config_lines, name, input, column_map): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -44,13 +96,14 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" return {'descriptor': '{0}_affine'.format(name), 'dimension': output_dim} -def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = ""): +def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) - components.append("component name={0}_relu type=RectifiedLinearComponent dim={1}".format(name, output_dim)) - components.append("component name={0}_renorm type=NormalizeComponent dim={1}".format(name, output_dim)) + components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) + components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) @@ -59,7 +112,34 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options return {'descriptor': '{0}_renorm'.format(name), 'dimension': output_dim} +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = "component name={0}_conv type=ConvolutionComponent input-x-dim={1} input-y-dim={2} input-z-dim={3} filt-x-dim={4} filt-y-dim={5} filt-x-step={6} filt-y-step={7} input-vectorization-order={8}".format(name, input_x_dim, input_y_dim, input_z_dim, filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, input_vectorization) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + if is_updatable: + conv_init_string += " is-updatable=true" + else: + conv_init_string += " is-updatable=false" + + components.append(conv_init_string) + component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, 
input['descriptor'])) + num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim} def AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -72,152 +152,83 @@ def AddSoftmaxLayer(config_lines, name, input): 'dimension': input['dimension']} -def AddOutputNode(config_lines, input, label_delay=None): +def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] - if label_delay is None: - component_nodes.append('output-node name=output input={0}'.format(input['descriptor'])) - else: - component_nodes.append('output-node name=output input=Offset({0},{1})'.format(input['descriptor'], label_delay)) - -def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = "", label_delay=None, include_softmax = "true"): - prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim, ng_affine_options) - if include_softmax == "true": - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output, label_delay) + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' + components.append("component name={0}_sigmoid type=SigmoidComponent dim={1}".format(name, input['dimension'], self_repair_string)) + component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor'])) + return {'descriptor': '{0}_sigmoid'.format(name), + 'dimension': input['dimension']} -def AddLstmLayer(config_lines, - name, input, cell_dim, - recurrent_projection_dim = 0, - non_recurrent_projection_dim = 0, - clipping_threshold = 1.0, - norm_based_clipping = "false", - ng_per_element_scale_options = "", - ng_affine_options = "", - lstm_delay = -1): - assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) +def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"): components = config_lines['components'] component_nodes = config_lines['component-nodes'] + name = 'output' + if suffix is not None: + name = '{0}-{1}'.format(name, suffix) - input_descriptor = input['descriptor'] - input_dim = input['dimension'] - name = name.strip() - - if (recurrent_projection_dim == 0): - add_recurrent_projection = False - recurrent_projection_dim = cell_dim - recurrent_connection = "m_t" - else: - add_recurrent_projection = True - recurrent_connection = "r_t" - if (non_recurrent_projection_dim == 0): - add_non_recurrent_projection = False + if label_delay is None: + component_nodes.append('output-node name={0} input={1} objective={2}'.format(name, input['descriptor'], objective_type)) else: - add_non_recurrent_projection = True - - # Natural gradient per element scale parameters - ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " - # Parameter Definitions W*(* replaced by - to have valid names) - components.append("# Input gate control : W_i* matrices") - components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - components.append("# note : the cell outputs pass through a diagonal matrix") - 
components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) - - components.append("# Forget gate control : W_f* matrices") - components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) - - components.append("# Output gate control : W_o* matrices") - components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) - - components.append("# Cell input matrices : W_c* matrices") - components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - - - components.append("# Defining the non-linearities") - components.append("component name={0}_i type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_f type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_o type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_g type=TanhComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_h type=TanhComponent dim={1}".format(name, cell_dim)) - - components.append("# Defining the cell computations") - components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) - - # c1_t and c2_t defined below - component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name)) - c_tminus1_descriptor = "IfDefined(Offset({0}_c_t, {1}))".format(name, lstm_delay) - - component_nodes.append("# i_t") - component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - component_nodes.append("component-node name={0}_i2 component={0}_w_ic input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name)) - - component_nodes.append("# f_t") - component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - 
component_nodes.append("component-node name={0}_f2 component={0}_w_fc input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name)) - - component_nodes.append("# o_t") - component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - component_nodes.append("component-node name={0}_o2 component={0}_w_oc input={0}_c_t".format(name)) - component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name)) - - component_nodes.append("# h_t") - component_nodes.append("component-node name={0}_h_t component={0}_h input={0}_c_t".format(name)) - - component_nodes.append("# g_t") - component_nodes.append("component-node name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name)) - - component_nodes.append("# parts of c_t") - component_nodes.append("component-node name={0}_c1_t component={0}_c1 input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name)) + component_nodes.append('output-node name={0} input=Offset({1},{2}) objective={3}'.format(name, input['descriptor'], label_delay, objective_type)) + +def AddFinalLayer(config_lines, input, output_dim, + ng_affine_options = " param-stddev=0 bias-stddev=0 ", + label_delay=None, + use_presoftmax_prior_scale = False, + prior_scale_file = None, + include_log_softmax = True, + name_affix = None, + objective_type = "linear"): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] - component_nodes.append("# m_t") - component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, {0}_h_t)".format(name)) + if name_affix is not None: + final_node_prefix = 'Final-' + str(name_affix) + else: + final_node_prefix = 'Final' + + prev_layer_output = AddAffineLayer(config_lines, + final_node_prefix , input, output_dim, + ng_affine_options) + if include_log_softmax: + if use_presoftmax_prior_scale : + components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) + component_nodes.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix, + prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix) + prev_layer_output = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + # we use the same name_affix as a prefix in for affine/scale nodes but as a + # suffix for output node + AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) + +def AddFinalSigmoidLayer(config_lines, input, output_dim, + ng_affine_options = " param-stddev=0 bias-stddev=0 ", + label_delay=None, + name_affix = None, + objective_type = "quadratic"): + # Useful when you need the final outputs to be probabilities + # between 0 and 1. 
+ # Usually used with an objective-type such as "quadratic" + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] - # add the recurrent connections - if (add_recurrent_projection and add_non_recurrent_projection): - components.append("# projection matrices : Wrm and Wpm") - components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) - component_nodes.append("# r_t and p_t") - component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) - component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) - component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) - output_descriptor = '{0}_rp_t'.format(name) - output_dim = recurrent_projection_dim + non_recurrent_projection_dim + if name_affix is not None: + final_node_prefix = 'Final-' + str(name_affix) + else: + final_node_prefix = 'Final' - elif add_recurrent_projection: - components.append("# projection matrices : Wrm") - components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) - component_nodes.append("# r_t") - component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) - component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) - output_descriptor = '{0}_r_t'.format(name) - output_dim = recurrent_projection_dim + prev_layer_output = AddAffineLayer(config_lines, + final_node_prefix , input, output_dim, + ng_affine_options) + prev_layer_output = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) - else: - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) - component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name)) - output_descriptor = '{0}_r_t'.format(name) - output_dim = cell_dim - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } -def AddClstmLayer(config_lines, +def AddLstmLayer(config_lines, name, input, cell_dim, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, @@ -226,7 +237,7 @@ def AddClstmLayer(config_lines, ng_per_element_scale_options = "", ng_affine_options = "", lstm_delay = -1, - rates = [1]): + self_repair_scale = None): assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -247,6 +258,7 @@ def AddClstmLayer(config_lines, else: add_non_recurrent_projection = True + self_repair_string = 
"self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' # Natural gradient per element scale parameters ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " # Parameter Definitions W*(* replaced by - to have valid names) @@ -268,12 +280,13 @@ def AddClstmLayer(config_lines, components.append("# Cell input matrices : W_c* matrices") components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# Defining the non-linearities") - components.append("component name={0}_i type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_f type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_o type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_g type=TanhComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_h type=TanhComponent dim={1}".format(name, cell_dim)) + components.append("component name={0}_i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_g type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_h type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) components.append("# Defining the cell computations") components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) @@ -346,6 +359,3 @@ def AddClstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - - - diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index f4de09740ae..bfdfa4da23f 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -26,6 +26,10 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel parallel_opts= # ignored now. scoring_opts= skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 feat_type= online_ivector_dir= minimize=false @@ -132,7 +136,7 @@ if [ ! 
-z "$online_ivector_dir" ]; then fi if [ "$post_decode_acwt" == 1.0 ]; then - lat_wspecifier="ark|gzip -c >$dir/lat.JOB.gz" + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" else lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" fi @@ -146,6 +150,10 @@ if [ $stage -le 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ --word-symbol-table=$graphdir/words.txt "$model" \ diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 88cf54e824e..2290c4d2e7f 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -34,6 +34,11 @@ 'shape':'box', 'style':'filled' }, + 'ConvolutionComponent':{ + 'color':'lightpink', + 'shape':'box', + 'style':'filled' + }, 'FixedScaleComponent':{ 'color':'blueviolet', 'shape':'box', @@ -64,6 +69,11 @@ 'shape':'rectangle', 'style':'filled' }, + 'ClipGradientComponent':{ + 'color':'bisque', + 'shape':'rectangle', + 'style':'filled' + }, 'ElementwiseProductComponent':{ 'color':'green', 'shape':'rectangle', @@ -84,10 +94,10 @@ def GetDotNodeName(name_string, is_component = False): # 2. Nnet3 names can be shared among components and component nodes # dot does not allow common names # - name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("-", "hyphen", name_string) if is_component: - name_string += name_string.strip() + "_component" - return name_string + node_name_string += node_name_string.strip() + "_component" + return {"label":name_string, "node":node_name_string} def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -96,18 +106,18 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = "{{"+label+"}|Append}" - dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, 
label="{1}"];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -116,7 +126,7 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) if attr_string != '': dot_string += ' [{0}] '.format(attr_string) @@ -125,6 +135,28 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = return dot_graph +def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None): + dot_graph = [] + + label = 'Round ({0})'.format(segment['arguments'][1]) + style = None + if edge_attributes is not None: + if edge_attributes.has_key('label'): + label = "{0} {1}".format(edge_attributes['label'], label) + if edge_attributes.has_key('style'): + style = 'style={0}'.format(edge_attributes['style']) + + attr_string = 'label="{0}"'.format(label) + if style is not None: + attr_string += ' {0}'.format(style) + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], + attr_string)) + if segment['sub_segments']: + raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed") + return dot_graph + + def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -140,8 +172,8 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("Offset can just deal with forwarding descriptor, no sub-segments allowed") @@ -151,21 +183,23 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non dot_graph = [] names = [] desc_name = 'Sum_{0}'.format(affix) + # create the sum node for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) - dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) + dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i)) + # link the sum node parts to corresponding segments part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], 
GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = '{{'+label+'}|Sum}' - dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -174,7 +208,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) dot_string += ' [{0} tailport=s ] '.format(attr_string) dot_graph.append(dot_string) @@ -195,8 +229,8 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed") @@ -215,7 +249,7 @@ def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes dot_graph += DescriptorSegmentToDot(sub_segment, parent_node_name, parent_node_name, edge_attributes={'style':'dotted', 'label':'IfDefined'}) if segment['arguments']: - dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0]), GetDotNodeName(parent_node_name))) + dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_graph @@ -232,6 +266,8 @@ def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = N dot_graph += ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes) elif segment['name'] == "ReplaceIndex": dot_graph += ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes) + elif segment['name'] == "Round": + dot_graph += ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes) else: raise Exception('Descriptor {0}, is not recognized by this script. 
Please add Process{0}Descriptor method'.format(segment['name'])) return dot_graph @@ -244,7 +280,7 @@ def Nnet3DescriptorToDot(descriptor, parent_node_name): dot_lines += DescriptorSegmentToDot(segment, parent_node_name, parent_node_name) elif arguments: assert(len(arguments) == 1) - dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0]), GetDotNodeName(parent_node_name))) + dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_lines def ParseNnet3String(string): @@ -298,27 +334,28 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): except KeyError: pass - return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True), label, attr_string)] + return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)] # input-node name=input dim=40 def Nnet3InputToDot(parsed_config): - return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['dim'] )] + return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['dim'] )] # output-node name=output input=Final_log_softmax dim=3940 objective=linear +#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear def Nnet3OutputToDot(parsed_config): dot_graph = [] - dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['objective'])) - dot_graph.append('{0} -> {1}'.format(GetDotNodeName(parsed_config['input']), GetDotNodeName(parsed_config['name']))) + dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) + dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['objective'])) return dot_graph # dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256 def Nnet3DimrangeToDot(parsed_config): dot_graph = [] - dot_graph.append(parsed_config['name']) - dot_graph.append('{0} [shape=rectangle]'.format(GetDotNodeName(parsed_config['name']))) - dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node']), - GetDotNodeName(parsed_config['name']), + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [shape=rectangle, label="{1}"]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node'])['node'], + GetDotNodeName(parsed_config['name'])['node'], parsed_config['dim-offset'], parsed_config['dim'])) return dot_graph @@ -326,9 +363,10 @@ def Nnet3DimrangeToDot(parsed_config): def Nnet3ComponentNodeToDot(parsed_config): dot_graph = [] dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) - dot_graph.append('{0} [ label="{1}", shape=box ]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'])) - dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True), - GetDotNodeName(parsed_config['name']))) + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [ label="{1}", shape=box ]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [ weight=10 
]'.format(GetDotNodeName(parsed_config['component'], is_component = True)['node'], + GetDotNodeName(parsed_config['name'])['node'])) return dot_graph def GroupConfigs(configs, node_prefixes = []): @@ -408,6 +446,8 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): " will be clustered together in the dot-graph" " --node-prefixes Lstm1,Lstm2,Layer1", default=None) + parser.add_argument("dotfile", help="name of the dot output file") + print(' '.join(sys.argv), file=sys.stderr) args = parser.parse_args() @@ -420,4 +460,7 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): lines = sys.stdin.readlines() dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes) - print("\n".join(dot_graph)) + + dotfile_handle = open(args.dotfile, "w") + dotfile_handle.write("\n".join(dot_graph)) + dotfile_handle.close() diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index dc8cac9c0b0..364f6a72443 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -170,8 +170,8 @@ esac if [ -f $dir/trans.scp ]; then feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi if [ ! -z "$online_ivector_dir" ]; then diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh new file mode 100644 index 00000000000..7fbc24858b5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2015-2016 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/get_egs.sh but used +# when getting general targets (not from alignment directory) for raw nnet +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. +# +# This script dumps egs with several frames of labels, controlled by the +# frames_per_eg config variable (default: 8). This takes many times less disk +# space because typically we have 4 to 7 frames of context on the left and +# right, and this ends up getting shared. This is at the expense of slightly +# higher disk I/O while training. + + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +target_type=sparse # dense to have dense targets, + # sparse to have posteriors targets +num_targets= # required for target-type=sparse with raw nnet +frames_per_eg=8 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. 
+valid_left_context= # amount of left_context for validation egs, typically used in
+ # recurrent architectures to ensure matched condition with
+ # training egs
+valid_right_context= # amount of right_context for validation egs
+compress=true # set this to false to disable compression (e.g. if you want to see whether
+ # results are affected).
+
+reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg
+ # if there is only one archive and even with the
+ # reduced frames_per_eg, the number of
+ # samples_per_iter that would result is less than or
+ # equal to the user-specified value.
+num_utts_subset=300 # number of utterances in validation and training
+ # subsets used for shrinkage and diagnostics.
+num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
+num_train_frames_combine=10000 # # train frames for the above.
+num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
+samples_per_iter=400000 # this is the target number of egs in each archive of egs
+ # (prior to merging egs). We probably should have called
+ # it egs_per_iter. This is just a guideline; it will pick
+ # a number that divides the number of samples in the
+ # entire data.
+
+transform_dir=
+
+stage=0
+nj=6 # This should be set to the maximum number of jobs you are
+ # comfortable to run in parallel; you can increase it if your disk
+ # speed is greater and you have more machines.
+online_ivector_dir= # can be used if we are including speaker information as iVectors.
+cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda,
+ # it doesn't make sense to use different options than were used as input to the
+ # LDA transform). This is used to turn off CMVN in the online-nnet experiments.
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+ echo "Usage: $0 [opts] <data> <targets-scp> <egs-dir>"
+ echo " e.g.: $0 data/train data/train/snr_targets.scp exp/tri4_nnet/egs"
+ echo ""
+ echo "Main options (for others, see top of script file)"
+ echo " --config <config-file> # config file containing options"
+ echo " --nj <nj> # The maximum number of jobs you want to run in"
+ echo " # parallel (increase this only if you have good disk and"
+ echo " # network speed). default=6"
+ echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)"
+ echo " --feat-type <lda|raw> # (raw is the default). The feature type you want"
+ echo " # to use as input to the neural net."
+ echo " --frames-per-eg <frames;8> # number of frames per eg on disk"
+ echo " --left-context <width;4> # Number of frames on left side to append for feature input"
+ echo " --right-context <width;4> # Number of frames on right side to append for feature input"
+ echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
+ echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
+ echo " # very end."
+ echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
+ echo " # the middle."
+
+ exit 1;
+fi
+
+data=$1
+targets_scp=$2
+dir=$3
+
+# Check some files.
+[ ! -z "$online_ivector_dir" ] && \
+ extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+
+for f in $data/feats.scp $targets_scp $extra_files; do
+ [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset | sort \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset | sort > $dir/train_subset_uttlist || exit 1; + +if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + ;; + lda) + splice_opts=`cat $transform_dir/splice_opts 2>/dev/null` + # caution: the top-level nnet training script should copy these to its own dir now. + cp $transform_dir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=$(cat $dir/cmvn_opts) + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +else + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s:JOB:1:g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] +# (for small data)- while reduce_frames_per_eg == true and the number of +# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it +# by 1. +reduced=false +while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ + [ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do + frames_per_eg=$[$frames_per_eg-1] + num_archives=1 + reduced=true +done +$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). 
+max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] +! [ $egs_per_archive -le $samples_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/egs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +for n in `seq $nj`; do + utils/filter_scp.pl $sdata/$n/utt2spk $targets_scp > $dir/targets.$n.scp +done + +targets_scp_split=$dir/targets.JOB.scp + +if [ $target_type == "dense" ]; then + num_targets=$(feat-to-dim "scp:$targets_scp" - 2>/dev/null) || exit 1 +fi + +if [ -z "$num_targets" ]; then + echo "$0: num-targets is not set" + exit 1 +fi + +case $target_type in + "dense") + get_egs_program="nnet3-get-egs-dense-targets --num-targets=$num_targets" + + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | copy-feats scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | copy-feats scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | copy-feats scp:- ark:- |" + ;; + "sparse") + get_egs_program="nnet3-get-egs --num-pdfs=$num_targets" + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |" + ;; + default) + echo "$0: Unknown --target-type $target_type. Choices are dense and sparse" + exit 1 +esac + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." 
+ rm -f $dir/.error 2>/dev/null + $cmd $dir/log/create_valid_subset.log \ + $get_egs_program \ + $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + "$valid_targets" \ + "ark:$dir/valid_all.egs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + $get_egs_program \ + $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + "$train_subset_targets" \ + "ark:$dir/train_subset_all.egs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + ark:$dir/valid_combine.egs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + ark:$dir/valid_diagnostic.egs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + ark:$dir/train_combine.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + ark:$dir/train_diagnostic.egs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm -f $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs +fi + +if [ $stage -le 4 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + # The examples will go round-robin to egs_list. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + $get_egs_program \ + $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" "$targets" \ + ark:- \| \ + nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
+ output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)"
+ for x in $(seq $num_archives_intermediate); do
+ for y in $(seq $archives_multiple); do
+ archive_index=$[($x-1)*$archives_multiple+$y]
+ # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
+ ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1
+ done
+ done
+ $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
+ nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \
+ nnet3-copy-egs ark:- $output_archives || exit 1;
+ fi
+
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: removing temporary archives"
+ for x in $(seq $nj); do
+ for y in $(seq $num_archives_intermediate); do
+ file=$dir/egs_orig.$x.$y.ark
+ [ -L $file ] && rm $(readlink -f $file)
+ rm $file
+ done
+ done
+ if [ $archives_multiple -gt 1 ]; then
+ # there are some extra soft links that we should delete.
+ for f in $dir/egs.*.*.ark; do rm $f; done
+ fi
+ echo "$0: removing temporary files"
+ # Ignore errors below because trans.* might not exist.
+ rm -f $dir/trans.{ark,scp} $dir/targets.*.scp 2>/dev/null
+fi
+
+echo "$0: Finished preparing training examples"
+
diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py
index 17b8bea228d..9c2c641b0e9 100755
--- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py
+++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py
@@ -8,7 +8,133 @@ import copy
 import imp
-nodes = imp.load_source('', 'steps/nnet3/components.py')
+nodes = imp.load_source('nodes', 'steps/nnet3/components.py')
+nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py')
+chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py')
+
+def GetArgs():
+ # we add compulsory arguments as named arguments for readability
+ parser = argparse.ArgumentParser(description="Writes config files and variables "
+ "for LSTM creation and training",
+ epilog="See steps/nnet3/lstm/train.sh for example.")
+
+ # Only one of these arguments can be specified, and one of them has to
+ # be compulsorily specified
+ feat_group = parser.add_mutually_exclusive_group(required = True)
+ feat_group.add_argument("--feat-dim", type=int,
+ help="Raw feature dimension, e.g. 13")
+ feat_group.add_argument("--feat-dir", type=str,
+ help="Feature directory, from which we derive the feat-dim")
+
+ # only one of these arguments can be specified
+ ivector_group = parser.add_mutually_exclusive_group(required = False)
+ ivector_group.add_argument("--ivector-dim", type=int,
+ help="iVector dimension, e.g. 100", default=0)
+ ivector_group.add_argument("--ivector-dir", type=str,
+ help="iVector dir, which will be used to derive the ivector-dim ", default=None)
+
+ num_target_group = parser.add_mutually_exclusive_group(required = True)
+ num_target_group.add_argument("--num-targets", type=int,
+ help="number of network targets (e.g. num-pdf-ids/num-leaves)")
+ num_target_group.add_argument("--ali-dir", type=str,
+ help="alignment directory, from which we derive the num-targets")
+ num_target_group.add_argument("--tree-dir", type=str,
+ help="directory with final.mdl, from which we derive the num-targets")
+
+ # General neural network options
+ parser.add_argument("--splice-indexes", type=str,
+ help="Splice indexes at input layer, e.g. 
'-3,-2,-1,0,1,2,3'", required = True, default="0") + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + + # LSTM options + parser.add_argument("--num-lstm-layers", type=int, + help="Number of LSTM layers to be stacked", default=1) + parser.add_argument("--cell-dim", type=int, + help="dimension of lstm-cell") + parser.add_argument("--recurrent-projection-dim", type=int, + help="dimension of recurrent projection") + parser.add_argument("--non-recurrent-projection-dim", type=int, + help="dimension of non-recurrent projection") + parser.add_argument("--hidden-dim", type=int, + help="dimension of fully-connected layers") + + # Natural gradient options + parser.add_argument("--ng-per-element-scale-options", type=str, + help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") + parser.add_argument("--ng-affine-options", type=str, + help="options to be supplied to NaturalGradientAffineComponent", default="") + + # Gradient clipper options + parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + # Delay options + parser.add_argument("--label-delay", type=int, default=None, + help="option to delay the labels to make the lstm robust") + + parser.add_argument("--lstm-delay", type=str, default=None, + help="option to have different delays in recurrence for each lstm") + + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.num_lstm_layers < 1): + sys.exit("--num-lstm-layers has to be a positive integer") + if (args.clipping_threshold < 0): + sys.exit("--clipping-threshold has to be a non-negative") + if args.lstm_delay is None: + args.lstm_delay = [[-1]] * args.num_lstm_layers + else: + try: + args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) + except ValueError: + sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) + if len(args.lstm_delay) != args.num_lstm_layers: + sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") + + return args def PrintConfig(file_name, config_lines): f = open(file_name, 'w') @@ -77,143 +203,60 @@ def ParseLstmDelayString(lstm_delay): return lstm_delay_array - -if __name__ == "__main__": - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", - epilog="See steps/nnet3/lstm/train.sh for example.") - # General neural network options - parser.add_argument("--splice-indexes", type=str, - help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3' [compulsary argument]", default="0") - parser.add_argument("--feat-dim", type=int, - help="Raw feature dimension, e.g. 13") - parser.add_argument("--ivector-dim", type=int, - help="iVector dimension, e.g. 100", default=0) - parser.add_argument("--include-log-softmax", type=str, - help="add the final softmax layer ", default="true", choices = ["false", "true"]) - # LSTM options - parser.add_argument("--num-lstm-layers", type=int, - help="Number of LSTM layers to be stacked", default=1) - parser.add_argument("--cell-dim", type=int, - help="dimension of lstm-cell") - parser.add_argument("--recurrent-projection-dim", type=int, - help="dimension of recurrent projection") - parser.add_argument("--non-recurrent-projection-dim", type=int, - help="dimension of non-recurrent projection") - parser.add_argument("--hidden-dim", type=int, - help="dimension of fully-connected layers") - - # Natural gradient options - parser.add_argument("--ng-per-element-scale-options", type=str, - help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") - parser.add_argument("--ng-affine-options", type=str, - help="options to be supplied to NaturalGradientAffineComponent", default="") - - # Gradient clipper options - parser.add_argument("--norm-based-clipping", type=str, - help="use norm based clipping in ClipGradient components ", default="false", choices = ["false", "true"]) - parser.add_argument("--clipping-threshold", type=float, - help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=15) - - parser.add_argument("--num-targets", type=int, - help="number of network targets (e.g. 
num-pdf-ids/num-leaves)") - parser.add_argument("config_dir", - help="Directory to write config files and variables") - - # Delay options - parser.add_argument("--label-delay", type=int, default=None, - help="option to delay the labels to make the lstm robust") - - parser.add_argument("--lstm-delay", type=str, default=None, - help="option to have different delays in recurrence for each lstm") - - - - print(' '.join(sys.argv)) - - args = parser.parse_args() - - if not os.path.exists(args.config_dir): - os.makedirs(args.config_dir) - - ## Check arguments. - if args.splice_indexes is None: - sys.exit("--splice-indexes argument is required") - if args.feat_dim is None or not (args.feat_dim > 0): - sys.exit("--feat-dim argument is required") - if args.num_targets is None or not (args.num_targets > 0): - sys.exit("--feat-dim argument is required") - if (args.num_lstm_layers < 1): - sys.exit("--num-lstm-layers has to be a positive integer") - if (args.clipping_threshold < 0): - sys.exit("--clipping-threshold has to be a non-negative") - if args.lstm_delay is None: - lstm_delay = [-1] * args.num_lstm_layers - else: - try: - lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) - except ValueError: - sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) - if len(lstm_delay) != args.num_lstm_layers: - sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") - - parsed_splice_output = ParseSpliceString(args.splice_indexes.strip(), args.label_delay) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] - num_hidden_layers = parsed_splice_output['num_hidden_layers'] - splice_indexes = parsed_splice_output['splice_indexes'] - - if (num_hidden_layers < args.num_lstm_layers): - sys.exit("--num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(args.config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, + splice_indexes, lstm_delay, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + num_lstm_layers, num_hidden_layers, + norm_based_clipping, clipping_threshold, + ng_per_element_scale_options, ng_affine_options, + label_delay, include_log_softmax, xent_regularize, self_repair_scale): config_lines = {'components':[], 'component-nodes':[]} config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputNode(init_config_lines, prev_layer_output) - config_files[args.config_dir + '/init.config'] = init_config_lines + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output 
= nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - for i in range(args.num_lstm_layers): + for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # BLSTM layer case, add both forward and backward - prev_layer_output1 = nodes.AddLstmLayer(config_lines, "BLstm{0}_forward".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim, - args.clipping_threshold, args.norm_based_clipping, - args.ng_per_element_scale_options, args.ng_affine_options, - lstm_delay = lstm_delay[i][0]) - prev_layer_output2 = nodes.AddLstmLayer(config_lines, "BLstm{0}_backward".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim, - args.clipping_threshold, args.norm_based_clipping, - args.ng_per_element_scale_options, args.ng_affine_options, - lstm_delay = lstm_delay[i][1]) + prev_layer_output1 = nodes.AddLstmLayer(config_lines, "BLstm{0}_forward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) + prev_layer_output2 = nodes.AddLstmLayer(config_lines, "BLstm{0}_backward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][1], self_repair_scale = self_repair_scale) prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output1['descriptor'], prev_layer_output2['descriptor']) prev_layer_output['dimension'] = prev_layer_output1['dimension'] + prev_layer_output2['dimension'] else: # LSTM layer case - prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim, - args.clipping_threshold, args.norm_based_clipping, - args.ng_per_element_scale_options, args.ng_affine_options, - lstm_delay = lstm_delay[i][0]) + prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, args.ng_affine_options, args.label_delay, args.include_log_softmax) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} if len(lstm_delay[i]) == 2: # since the form 'Append(Append(xx, yy), zz)' is not allowed, here we don't wrap the descriptor with 'Append()' so that we would have the form @@ -223,17 +266,65 @@ def 
ParseLstmDelayString(lstm_delay): if len(lstm_delay[i]) == 2: # since there is no 'Append' in 'AffRelNormLayer', here we wrap the descriptor with 'Append()' prev_layer_output['descriptor'] = 'Append({0})'.format(prev_layer_output['descriptor']) - for i in range(args.num_lstm_layers, num_hidden_layers): + for i in range(num_lstm_layers, num_hidden_layers): prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), - prev_layer_output, args.hidden_dim, - args.ng_affine_options) + prev_layer_output, hidden_dim, + ng_affine_options, self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, args.ng_affine_options, args.label_delay, args.include_log_softmax) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} # printing out the configs # init.config used to train lda-mllt train for key in config_files.keys(): PrintConfig(key, config_files[key]) + + + + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers): + parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + if (num_hidden_layers < num_lstm_layers): + raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + # print('initial_right_context=' + str(splice_array[0][-1]), file=f) + f.close() + + return [left_context, right_context, num_hidden_layers, splice_indexes] + + +def Main(): + args = GetArgs() + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + + MakeConfigs(args.config_dir, + args.feat_dim, args.ivector_dim, args.num_targets, + splice_indexes, args.lstm_delay, args.cell_dim, + args.recurrent_projection_dim, args.non_recurrent_projection_dim, + args.num_lstm_layers, num_hidden_layers, + args.norm_based_clipping, + args.clipping_threshold, + args.ng_per_element_scale_options, args.ng_affine_options, + args.label_delay, args.include_log_softmax, args.xent_regularize, + args.self_repair_scale) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/lstm/train.sh b/egs/wsj/s5/steps/nnet3/lstm/train.sh index 10f6f793079..1717ea7b431 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/train.sh +++ b/egs/wsj/s5/steps/nnet3/lstm/train.sh @@ -560,10 +560,13 @@ while [ $x -lt $num_iters ]; do cur_num_hidden_layers=$[1+$x/$add_layers_period] 
config=$dir/configs/layer$cur_num_hidden_layers.config raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_read_opt="" # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_read_opt="--read-cache=$dir/cache.$x" fi if $do_average; then this_num_chunk_per_minibatch=$num_chunk_per_minibatch @@ -593,8 +596,15 @@ while [ $x -lt $num_iters ]; do k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + if [ $n -eq 1 ]; then + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt=" --write-cache=$dir/cache.$[$x+1]" + else + cache_write_opt="" + fi $cmd $train_queue_opt $dir/log/train.$x.$n.log \ - nnet3-train $parallel_train_opts --print-interval=10 --momentum=$momentum \ + nnet3-train $parallel_train_opts $cache_read_opt $cache_write_opt --print-interval=10 --momentum=$momentum \ --max-param-change=$max_param_change \ --optimization.min-deriv-time=$min_deriv_time "$raw" \ "ark:nnet3-copy-egs $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_num_chunk_per_minibatch --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ @@ -641,6 +651,7 @@ while [ $x -lt $num_iters ]; do rm $dir/$[$x-1].mdl fi fi + rm $dir/cache.$x 2>/dev/null x=$[$x+1] num_archives_processed=$[$num_archives_processed+$this_num_jobs] done @@ -661,9 +672,6 @@ if [ $stage -le $num_iters ]; then nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; done - # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, - # as if there are many models it can give out-of-memory error; and we set - # num-threads to 8 to speed it up (this isn't ideal...) combine_num_chunk_per_minibatch=$(python -c "print int(1024.0/($chunk_width))") $cmd $combine_queue_opt $dir/log/combine.log \ nnet3-combine --num-iters=40 \ diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py new file mode 100755 index 00000000000..af6afcb99e3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python + +# tdnn or RNN with 'jesus layer' + +# inputs to jesus layer: +# - for each spliced version of the previous layer the output (of dim --jesus-forward-output-dim) + +# outputs of jesus layer: +# for all layers: +# --jesus-forward-output-dim + + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings +import imp + +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/train_tdnn.sh for example."); +parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice[:recurrence] indexes at each hidden layer, e.g. 
'-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'. " + "Note: recurrence indexes are optional, may not appear in 1st layer, and must be " + "either all negative or all positive for any given layer.") + +# Only one of these arguments can be specified, and one of them has to +# be compulsarily specified +feat_group = parser.add_mutually_exclusive_group(required = True) +feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") +feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + +# only one of these arguments can be specified +ivector_group = parser.add_mutually_exclusive_group(required = False) +ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) +ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + +num_target_group = parser.add_mutually_exclusive_group(required = True) +num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") +num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") +num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) +parser.add_argument("--xent-separate-forward-affine", type=str, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default="false", choices = ["false", "true"]) +parser.add_argument("--use-repeated-affine", type=str, + help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)", + default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-learning-rate-factor", type=float, + help="Learning-rate factor for final affine component", + default=1.0) +parser.add_argument("--self-repair-scale", type=float, + help="Small scale involved in fixing derivatives, if supplied (e.g. try 0.00001)", + default=0.0) +parser.add_argument("--jesus-hidden-dim", type=int, + help="hidden dimension of Jesus layer.", default=10000) +parser.add_argument("--jesus-forward-output-dim", type=int, + help="part of output dimension of Jesus layer that goes to next layer", + default=1000) +parser.add_argument("--jesus-forward-input-dim", type=int, + help="Input dimension of Jesus layer that comes from affine projection " + "from the previous layer (same as output dim of forward affine transform)", + default=1000) +parser.add_argument("--final-hidden-dim", type=int, + help="Final hidden layer dimension-- or if <0, the same as " + "--jesus-forward-input-dim", default=-1) +parser.add_argument("--num-jesus-blocks", type=int, + help="number of blocks in Jesus layer. 
All configs of the form " + "--jesus-*-dim will be rounded up to be a multiple of this.", + default=100); +parser.add_argument("--jesus-stddev-scale", type=float, + help="Scaling factor on parameter stddev of Jesus layer (smaller->jesus layer learns faster)", + default=1.0) +parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components (only relevant if " + "recurrence indexes are specified). If clipping-threshold=0 no clipping is done", + default=15) +parser.add_argument("config_dir", + help="Directory to write config files and variables"); + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + +## Check arguments. +if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + +if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) +elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + +if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + +if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + +if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + +if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + +## Check arguments. +if args.num_jesus_blocks < 1: + sys.exit("invalid --num-jesus-blocks value"); +if args.final_hidden_dim < 0: + args.final_hidden_dim = args.jesus_forward_input_dim + +for name in [ "jesus_hidden_dim", "jesus_forward_output_dim", "jesus_forward_input_dim", + "final_hidden_dim" ]: + old_val = getattr(args, name) + if old_val % args.num_jesus_blocks != 0: + new_val = old_val + args.num_jesus_blocks - (old_val % args.num_jesus_blocks) + printable_name = '--' + name.replace('_', '-') + print('Rounding up {0} from {1} to {2} to be a multiple of --num-jesus-blocks={3} '.format( + printable_name, old_val, new_val, args.num_jesus_blocks)) + setattr(args, name, new_val); + +# this is a bit like a struct, initialized from a string, which describes how to +# set up the statistics-pooling and statistics-extraction components. +# An example string is 'mean(-99:3:9::99)', which means, compute the mean of +# data within a window of -99 to +99, with distinct means computed every 9 frames +# (we round to get the appropriate one), and with the input extracted on multiples +# of 3 frames (so this will force the input to this layer to be evaluated +# every 3 frames). Another example string is 'mean+stddev(-99:3:9:99)', +# which will also cause the standard deviation to be computed. +class StatisticsConfig: + # e.g. 
c = StatisticsConfig('mean+stddev(-99:3:9:99)', 400, 'jesus1-forward-output-affine') + def __init__(self, config_string, input_dim, input_name): + self.input_dim = input_dim + self.input_name = input_name + + m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + config_string) + if m == None: + sys.exit("Invalid splice-index or statistics-config string: " + config_string) + self.output_stddev = (m.group(1) != 'mean') + self.left_context = -int(m.group(2)) + self.input_period = int(m.group(3)) + self.stats_period = int(m.group(4)) + self.right_context = int(m.group(5)) + if not (self.left_context > 0 and self.right_context > 0 and + self.input_period > 0 and self.stats_period > 0 and + self.left_context % self.stats_period == 0 and + self.right_context % self.stats_period == 0 and + self.stats_period % self.input_period == 0): + sys.exit("Invalid configuration of statistics-extraction: " + config_string) + + # OutputDim() returns the output dimension of the node that this produces. + def OutputDim(self): + return self.input_dim * (2 if self.output_stddev else 1) + + # OutputDims() returns an array of output dimensions, consisting of + # [ input-dim ] if just "mean" was specified, otherwise + # [ input-dim input-dim ] + def OutputDims(self): + return [ self.input_dim, self.input_dim ] if self.output_stddev else [ self.input_dim ] + + # Descriptor() returns the textual form of the descriptor by which the + # output of this node is to be accessed. + def Descriptor(self): + return 'Round({0}-pooling-{1}-{2}, {3})'.format(self.input_name, self.left_context, self.right_context, + self.stats_period) + + # This function writes the configuration lines need to compute the specified + # statistics, to the file f. + def WriteConfigs(self, f): + print('component name={0}-extraction-{1}-{2} type=StatisticsExtractionComponent input-dim={3} ' + 'input-period={4} output-period={5} include-variance={6} '.format( + self.input_name, self.left_context, self.right_context, + self.input_dim, self.input_period, self.stats_period, + ('true' if self.output_stddev else 'false')), file=f) + print('component-node name={0}-extraction-{1}-{2} component={0}-extraction-{1}-{2} input={0} '.format( + self.input_name, self.left_context, self.right_context), file=f) + stats_dim = 1 + self.input_dim * (2 if self.output_stddev else 1) + print('component name={0}-pooling-{1}-{2} type=StatisticsPoolingComponent input-dim={3} ' + 'input-period={4} left-context={1} right-context={2} num-log-count-features=0 ' + 'output-stddevs={5} '.format(self.input_name, self.left_context, self.right_context, + stats_dim, self.stats_period, + ('true' if self.output_stddev else 'false')), + file=f) + print('component-node name={0}-pooling-{1}-{2} component={0}-pooling-{1}-{2} input={0}-extraction-{1}-{2} '.format( + self.input_name, self.left_context, self.right_context), file=f) + + + + +## Work out splice_array +## e.g. for +## args.splice_indexes == '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3' +## we would have +## splice_array = [ [ -3,-2,...3 ], [-3,0] [-3,0] [-6,-3,0] + + +splice_array = [] +left_context = 0 +right_context = 0 +split_on_spaces = args.splice_indexes.split(" "); # we already checked the string is nonempty. 
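+# Each space-separated entry configures one layer; entries that do not parse as
+# integer offsets must be valid statistics-config specifiers such as
+# 'mean+stddev(-99:3:9:99)', and these are not allowed in the first layer.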
+if len(split_on_spaces) < 2: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) +try: + for string in split_on_spaces: + this_layer = len(splice_array) + + this_splices = string.split(",") + splice_array.append(this_splices) + # the rest of this block updates left_context and right_context, and + # does some checking. + leftmost_splice = 10000 + rightmost_splice = -10000 + for s in this_splices: + try: + n = int(s) + if n < leftmost_splice: + leftmost_splice = n + if n > rightmost_splice: + rightmost_splice = n + except: + if len(splice_array) == 1: + sys.exit("First dimension of splicing array must not have averaging [yet]") + try: + x = StatisticsConfig(s, 100, 'foo') + except: + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics: " + s) + + if leftmost_splice == 10000 or rightmost_splice == -10000: + sys.exit("invalid element of --splice-indexes: " + string) + left_context += -leftmost_splice + right_context += rightmost_splice +except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + " " + str(e)) +left_context = max(0, left_context) +right_context = max(0, right_context) +num_hidden_layers = len(splice_array) +input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + +f = open(args.config_dir + "/vars", "w") +print('left_context=' + str(left_context), file=f) +print('right_context=' + str(right_context), file=f) +print('num_hidden_layers=' + str(num_hidden_layers), file=f) +f.close() + + +f = open(args.config_dir + "/init.config", "w") +print('# Config file for initializing neural network prior to', file=f) +print('# preconditioning matrix computation', file=f) +print('input-node name=input dim=' + str(args.feat_dim), file=f) +list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ] +if args.ivector_dim > 0: + print('input-node name=ivector dim=' + str(args.ivector_dim), file=f) + list.append('ReplaceIndex(ivector, t, 0)') +# example of next line: +# output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))" +print('output-node name=output input=Append({0})'.format(", ".join(list)), file=f) +f.close() + + +for l in range(1, num_hidden_layers + 1): + # the following summarizes the structure of the layers: Here, the Jesus component includes ReLU at its input and output, and renormalize + # at its output after the ReLU. + # layer1: splice + LDA-transform + affine + ReLU + renormalize + # layerX: splice + Jesus + affine + ReLU + + # Inside the jesus component is: + # [permute +] ReLU + repeated-affine + ReLU + repeated-affine + # [we make the repeated-affine the last one so we don't have to redo that in backprop]. + # We follow this with a post-jesus composite component containing the operations: + # [permute +] ReLU + renormalize + # call this post-jesusN. + # After this we use dim-range nodes to split up the output into + # [ jesusN-forward-output, jesusN-direct-output and jesusN-projected-output ] + # parts; + # and nodes for the jesusN-forward-affine. + + f = open(args.config_dir + "/layer{0}.config".format(l), "w") + print('# Config file for layer {0} of the network'.format(l), file=f) + if l == 1: + print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'. 
+ format(args.config_dir), file=f) + splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] + if args.ivector_dim > 0: splices.append('ReplaceIndex(ivector, t, 0)') + orig_input='Append({0})'.format(', '.join(splices)) + # e.g. orig_input = 'Append(Offset(input, -2), ... Offset(input, 2), ivector)' + print('component-node name=lda component=lda input={0}'.format(orig_input), + file=f) + # after the initial LDA transform, put a trainable affine layer and a ReLU, followed + # by a NormalizeComponent. + print('component name=affine1 type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} bias-stddev=0'.format( + input_dim, args.jesus_forward_input_dim), file=f) + print('component-node name=affine1 component=affine1 input=lda', + file=f) + # the ReLU after the affine + print('component name=relu1 type=RectifiedLinearComponent dim={1} self-repair-scale={2}'.format( + l, args.jesus_forward_input_dim, args.self_repair_scale), file=f) + print('component-node name=relu1 component=relu1 input=affine1', file=f) + # the renormalize component after the ReLU + print ('component name=renorm1 type=NormalizeComponent dim={0} '.format( + args.jesus_forward_input_dim), file=f) + print('component-node name=renorm1 component=renorm1 input=relu1', file=f) + cur_output = 'renorm1' + cur_affine_output_dim = args.jesus_forward_input_dim + else: + splices = [] + spliced_dims = [] + for s in splice_array[l-1]: + # the connection from the previous layer + try: + offset = int(s) + # it's an integer offset. + splices.append('Offset({0}, {1})'.format(cur_output, offset)) + spliced_dims.append(cur_affine_output_dim) + except: + # it's not an integer offset, so assume it specifies the + # statistics-extraction. + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) + + # get the input to the Jesus layer. + cur_input = 'Append({0})'.format(', '.join(splices)) + cur_dim = sum(spliced_dims) + + this_jesus_output_dim = args.jesus_forward_output_dim + + # As input to the Jesus component we'll append the spliced input and any + # mean/stddev-stats input, and the first thing inside the component that + # we do is rearrange the dimensions so that things pertaining to a + # particular block stay together. + + column_map = [] + for x in range(0, args.num_jesus_blocks): + dim_offset = 0 + for src_splice in spliced_dims: + src_block_size = src_splice / args.num_jesus_blocks + for y in range(0, src_block_size): + column_map.append(dim_offset + (x * src_block_size) + y) + dim_offset += src_splice + if sorted(column_map) != range(0, sum(spliced_dims)): + print("column_map is " + str(column_map)) + print("num_jesus_blocks is " + str(args.num_jesus_blocks)) + print("spliced_dims is " + str(spliced_dims)) + sys.exit("code error creating new column order") + + need_input_permute_component = (column_map != range(0, sum(spliced_dims))) + + # Now add the jesus component. + + permute_offset = (1 if need_input_permute_component else 0) + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + num_sub_components = 4 + permute_offset + hidden_else_output_dim = args.jesus_hidden_dim + else: # no hidden part in jesus layer. 
+ num_sub_components = 2 + permute_offset + hidden_else_output_dim = args.jesus_forward_output_dim + print('component name=jesus{0} type=CompositeComponent num-components={1}'.format( + l, num_sub_components), file=f, end='') + # print the sub-components of the CompositeComopnent on the same line. + # this CompositeComponent has the same effect as a sequence of + # components, but saves memory. + if need_input_permute_component: + print(" component1='type=PermuteComponent column-map={1}'".format( + l, ','.join([str(x) for x in column_map])), file=f, end='') + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 1 + permute_offset, + cur_dim, args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks)), + file=f, end='') + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 3 + permute_offset, hidden_else_output_dim, + args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(args.jesus_hidden_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt((args.jesus_hidden_dim / args.num_jesus_blocks))), + file=f, end='') + + print("", file=f) # print newline. + print('component-node name=jesus{0} component=jesus{0} input={1}'.format( + l, cur_input), file=f) + + # now print the post-Jesus component which consists of ReLU + + # renormalize. + + num_sub_components = 2 + print('component name=post-jesus{0} type=CompositeComponent num-components=2'.format(l), + file=f, end='') + + # still within the post-Jesus component, print the ReLU + print(" component1='type=RectifiedLinearComponent dim={0} self-repair-scale={1}'".format( + this_jesus_output_dim, args.self_repair_scale), file=f, end='') + # still within the post-Jesus component, print the NormalizeComponent + print(" component2='type=NormalizeComponent dim={0} '".format( + this_jesus_output_dim), file=f, end='') + print("", file=f) # print newline. 
+ print('component-node name=post-jesus{0} component=post-jesus{0} input=jesus{0}'.format(l), + file=f) + + # handle the forward output, we need an affine node for this: + cur_affine_output_dim = (args.jesus_forward_input_dim if l < num_hidden_layers else args.final_hidden_dim) + print('component name=forward-affine{0} type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. + format(l, args.jesus_forward_output_dim, cur_affine_output_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine component=forward-affine{0} input=post-jesus{0}'.format( + l), file=f) + # for each recurrence delay, create an affine node followed by a + # clip-gradient node. [if there are multiple recurrences in the same layer, + # each one gets its own affine projection.] + + # The reason we set the param-stddev to 0 is out of concern that if we + # initialize to nonzero, this will encourage the corresponding inputs at + # the jesus layer to become small (to remove this random input), which + # in turn will make this component learn slowly (due to small + # derivatives). we set the bias-mean to 0.001 so that the ReLUs on the + # input of the Jesus layer are in the part of the activation that has a + # nonzero derivative- otherwise with this setup it would never learn. + + cur_output = 'jesus{0}-forward-output-affine'.format(l) + + + # with each new layer we regenerate the final-affine component, with a ReLU before it + # because the layers we printed don't end with a nonlinearity. + print('component name=final-relu type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + cur_affine_output_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu component=final-relu input={0}'.format(cur_output), + file=f) + print('component name=final-affine type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} learning-rate-factor={2} param-stddev=0.0 bias-stddev=0'.format( + cur_affine_output_dim, args.num_targets, + args.final_layer_learning_rate_factor), file=f) + print('component-node name=final-affine component=final-affine input=final-relu', + file=f) + # printing out the next two, and their component-nodes, for l > 1 is not + # really necessary as they will already exist, but it doesn't hurt and makes + # the structure clearer. + if args.include_log_softmax == "true": + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) + + if args.xent_regularize != 0.0: + xent_input = 'final-relu' + if l == num_hidden_layers and args.xent_separate_forward_affine == "true": + print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. 
+ format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format( + l), file=f) + print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + args.final_hidden_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu-xent component=final-relu-xent ' + 'input=jesus{0}-forward-output-affine-xent'.format(l), file=f) + xent_input = 'final-relu-xent' + + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + print('component name=final-affine-xent type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format( + cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f) + print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format( + xent_input), file=f) + print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax-xent component=final-log-softmax-xent ' + 'input=final-affine-xent', file=f) + print('output-node name=output-xent input=final-log-softmax-xent', file=f) + + f.close() diff --git a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py index 12c7a26e46d..8403c273a9d 100644 --- a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py @@ -70,7 +70,7 @@ splice_array = [] left_context = 0 right_context = 0 -split1 = args.splice_indexes.split(" "); # we already checked the string is nonempty. +split1 = args.splice_indexes.split(); # we already checked the string is nonempty. if len(split1) < 1: sys.exit("invalid --splice-indexes argument, too short: " + args.splice_indexes) diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index 24666b8bd02..c36de8c16bf 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -1,11 +1,12 @@ #!/bin/bash # script showing use of nnet3_to_dot.py -# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). +# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). # Begin configuration section. component_attributes="name,type" node_prefixes="" +info_bin=nnet3-am-info echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. @@ -20,7 +21,7 @@ if [ $# != 3 ]; then echo " --node-prefixes # list of prefixes. 
Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" - + exit 1; fi @@ -29,10 +30,10 @@ dot_file=$2 output_file=$3 attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} -nnet3-am-info $model | \ +$info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ - $attr > $dot_file + $attr $dot_file command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } -dot -Tpng $dot_file -o $output_file +dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py new file mode 100644 index 00000000000..166a6b85be2 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -0,0 +1,658 @@ +import subprocess +import logging +import math +import re +import time +import argparse + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. """ + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +def CheckIfCudaCompiled(): + p = subprocess.Popen("cuda-compiled") + p.communicate() + if p.returncode == 1: + return False + else: + return True + +def RunKaldiCommand(command, wait = True): + """ Runs commands frequently seen in Kaldi scripts. 
These are usually a
+    sequence of commands connected by pipes, so we use shell=True """
+    #logger.info("Running the command\n{0}".format(command))
+    p = subprocess.Popen(command, shell = True,
+                         stdout = subprocess.PIPE,
+                         stderr = subprocess.PIPE)
+
+    if wait:
+        [stdout, stderr] = p.communicate()
+        if p.returncode != 0:
+            raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr)
+        return stdout, stderr
+    else:
+        return p
+
+def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0):
+    assert(num_models > 0)
+
+    parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames")
+    objf = []
+    for i in range(num_models):
+        model_num = i + 1
+        logfile = re.sub('%', str(model_num), log_file_pattern)
+        lines = open(logfile, 'r').readlines()
+        this_objf = -100000
+        for line_num in range(1, len(lines) + 1):
+            # we search from the end, as this results in fewer regex searches
+            # (Python regex is slow!)
+            mat_obj = parse_regex.search(lines[-1*line_num])
+            if mat_obj is not None:
+                this_objf = float(mat_obj.groups()[0])
+                break
+        objf.append(this_objf)
+    max_index = objf.index(max(objf))
+    accepted_models = []
+    for i in range(num_models):
+        if (objf[max_index] - objf[i]) <= difference_threshold:
+            accepted_models.append(i+1)
+
+    if len(accepted_models) != num_models:
+        logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern))
+
+    return [accepted_models, max_index+1]
+
+def GetNumberOfLeaves(alidir):
+    [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir))
+    parts = stdout.split()
+    assert(parts[0] == "num-pdfs")
+    num_leaves = int(parts[1])
+    if num_leaves == 0:
+        raise Exception("Number of leaves is 0")
+    return num_leaves
+
+def GetNumberOfJobs(alidir):
+    try:
+        num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip())
+    except (IOError, ValueError):
+        raise Exception('Exception while reading the number of alignment jobs')
+    return num_jobs
+
+def GetIvectorDim(ivector_dir = None):
+    if ivector_dir is None:
+        return 0
+    [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{dir}/ivector_online.scp -".format(dir = ivector_dir))
+    ivector_dim = int(stdout_val)
+    return ivector_dim
+
+def GetFeatDim(feat_dir):
+    [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{data}/feats.scp -".format(data = feat_dir))
+    feat_dim = int(stdout_val)
+    return feat_dim
+
+def ReadKaldiMatrix(matrix_file):
+    try:
+        lines = map(lambda x: x.split(), open(matrix_file).readlines())
+        first_field = lines[0][0]
+        last_field = lines[-1][-1]
+        lines[0] = lines[0][1:]
+        lines[-1] = lines[-1][:-1]
+        if not (first_field == "[" and last_field == "]"):
+            raise Exception("Kaldi matrix file has incorrect format; only text-format matrix files can be read by this script")
+        for i in range(len(lines)):
+            lines[i] = map(lambda x: int(float(x)), lines[i])
+        return lines
+    except IOError:
+        raise Exception("Error while reading the kaldi matrix file {0}".format(matrix_file))
+
+def WriteKaldiMatrix(output_file, matrix):
+    # matrix is a list of lists
+    file = open(output_file, 'w')
+    file.write("[ ")
+    num_rows = len(matrix)
+    if num_rows == 0:
+        raise Exception("Matrix is empty")
+    num_cols = len(matrix[0])
+
+    for row_index in range(len(matrix)):
+        if num_cols != 
len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to have the same length") + file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file.write("\n") + file.write(" ]") + file.close() + +import shutil +def CopyEgsPropertiesToExpDir(egs_dir, dir): + try: + for file in ['cmvn_opts', 'splice_opts', 'final.mat']: + file_name = '{dir}/{file}'.format(dir = egs_dir, file = file) + if os.path.isfile(file_name): + shutil.copy2(file_name, dir) + except IOError: + raise Exception("Error while trying to copy egs property files to {dir}".format(dir = dir)) + +def SplitData(data, num_jobs): + RunKaldiCommand("utils/split_data.sh {data} {num_jobs}".format(data = data, + num_jobs = num_jobs)) + +def ParseModelConfigVarsFile(var_file): + try: + var_file_handle = open(var_file, 'r') + model_left_context = None + model_right_context = None + num_hidden_layers = None + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1] + if field_name in ['model_left_context', 'left_context']: + model_left_context = int(field_value) + elif field_name in ['model_right_context', 'right_context']: + model_right_context = int(field_value) + elif field_name == 'num_hidden_layers': + num_hidden_layers = int(field_value) + + if model_left_context is not None and model_right_context is not None and num_hidden_layers is not None: + return [model_left_context, model_right_context, num_hidden_layers] + + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) + + +def GenerateEgs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + samples_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + RunKaldiCommand(""" +steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {alidir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, data = data, alidir = alidir, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): + try: + egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format(egs_dir)).readline()) + egs_left_context = int(open('{0}/info/left_context'.format(egs_dir)).readline()) + egs_right_context = int(open('{0}/info/right_context'.format(egs_dir)).readline()) + if (feat_dim != egs_feat_dim) or 
(ivector_dim != egs_ivector_dim):
+            raise Exception('There is a mismatch between the feat-dim/ivector-dim of the current experiment and the provided egs directory')
+
+        if (egs_left_context < left_context) or (egs_right_context < right_context):
+            raise Exception('The egs have insufficient context')
+
+        frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline())
+        num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline())
+
+        return [egs_left_context, egs_right_context, frames_per_eg, num_archives]
+    except (IOError, ValueError):
+        raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir))
+
+def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts,
+                                 max_lda_jobs = None, rand_prune = 4.0,
+                                 lda_opts = None):
+    if max_lda_jobs is not None:
+        if num_lda_jobs > max_lda_jobs:
+            num_lda_jobs = max_lda_jobs
+
+    RunKaldiCommand("""
+{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \
+  nnet3-acc-lda-stats --rand-prune={rand_prune} \
+  {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format(
+        command = run_opts.command,
+        num_lda_jobs = num_lda_jobs,
+        dir = dir,
+        egs_dir = egs_dir,
+        rand_prune = rand_prune))
+
+    # the above command would have generated dir/{1..num_lda_jobs}.lda_stats
+    lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x),
+                         range(1, num_lda_jobs + 1))
+
+    RunKaldiCommand("""
+{command} {dir}/log/sum_transform_stats.log \
+  sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
+        command = run_opts.command,
+        dir = dir, lda_stat_files = " ".join(lda_stat_files)))
+
+    for file in lda_stat_files:
+        try:
+            os.remove(file)
+        except OSError:
+            raise Exception("There was an error while trying to remove the lda stat files.")
+    # This computes a fixed affine transform, in the way described in Appendix C.6
+    # of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant of an LDA
+    # transform but without dimensionality reduction.
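+    # estimate the transform from the summed stats and link it into the configs
+    # directory (see the ForceSymlink call below).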
+ + RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +import os, errno + +def ForceSymlink(file1, file2): + try: + os.symlink(file1, file2) + except OSError, e: + if e.errno == errno.EEXIST: + os.remove(file2) + os.symlink(file1, file2) + +def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, + presoftmax_prior_scale_power = None): + + # getting the raw pdf count + RunKaldiCommand(""" +{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ +ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ +post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- {dir}/pdf_counts.JOB + """.format(command = run_opts.command, + num_jobs = num_jobs, + dir = dir, + alidir = alidir)) + + RunKaldiCommand(""" +{command} {dir}/log/sum_pdf_counts.log \ +vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts + """.format(command = run_opts.command, dir = dir)) + + import glob + for file in glob.glob('{0}/pdf_counts.*'.format(dir)): + os.remove(file) + + smooth=0.01 + pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + total = sum(pdf_counts) + average_count = total/len(pdf_counts) + scales = [] + for i in range(len(pdf_counts)): + scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) + num_pdfs = len(pdf_counts) + scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + +def PrepareInitialAcousticModel(dir, alidir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # Convert to .mdl, train the transitions, set the priors. + RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) + +def VerifyIterations(num_iters, num_epochs, num_hidden_layers, + num_archives, max_models_combine, add_layers_period, + num_jobs_final): + """ Verifies that number of iterations are sufficient for various + phases of training.""" + + finish_add_layers_iter = num_hidden_layers * add_layers_period + + if num_iters <= (finish_add_layers_iter + 2): + raise Exception(' There are insufficient number of epochs. These are not even sufficient for layer-wise discriminatory training.') + + + approx_iters_per_epoch_final = num_archives/num_jobs_final + # First work out how many iterations we want to combine over in the final + # nnet3-combine-fast invocation. (We may end up subsampling from these if the + # number exceeds max_model_combine). 
The number we use is: + # min(max(max_models_combine, approx_iters_per_epoch_final), + # 1/2 * iters_after_last_layer_added) + half_iters_after_add_layers = (num_iters - finish_add_layers_iter)/2 + num_iters_combine = min(max(max_models_combine, approx_iters_per_epoch_final), half_iters_after_add_layers) + return num_iters_combine + +def GetRealignIters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): + """ Takes the realign_times string and identifies the approximate + iterations at which realignments have to be done.""" + # realign_times is a space seperated string of values between 0 and 1 + + realign_iters = [] + for realign_time in realign_times.split(): + realign_time = float(realign_time) + assert(realign_time > 0 and realign_time < 1) + if num_jobs_initial == num_jobs_final: + realign_iter = int(0.5 + num_iters * realign_time) + else: + realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, 2)) + realign_iter = realign_iter - num_jobs_initial + realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter * num_iters + realign_iters.append(int(realign_iter)) + + return realign_iters + +def Align(dir, data, lang, run_opts, iter = None, transform_dir = None, + online_ivector_dir = None): + + alidir = '{dir}/ali{ali_suffix}'.format(dir = dir, + ali_suffix = "_iter_{0}".format(iter) if iter is not None else "") + + logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( + gpu = " using gpu " if run_opts.realign_use_gpu else " ", + num_jobs = run_opts.realign_num_jobs )) + RunKaldiCommand(""" +steps/nnet3/align.sh --nj {num_jobs_align} --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir} + """.format(dir = dir, align_use_gpu = "yes" if run_opts.realign_use_gpu else "no", + align_cmd = run_opts.realign_command, + align_queue_opt = run_opts.realign_queue_opt, + num_jobs_align = run_opts.realign_num_jobs, + transform_dir = transform_dir if transform_dir is not None else "", + online_ivector_dir = online_ivector_dir if online_ivector_dir is not None else "", + iter = iter if iter is not None else "", + alidir = alidir, + lang = lang, data = data)) + return alidir + +def Realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, + prior_subset_size, num_archives, run_opts, + transform_dir = None, online_ivector_dir = None): + raise Exception("Realignment stage has not been implemented in nnet3") + logger.info("Getting average posterior for purposes of adjusting the priors.") + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
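+ # The intended flow below (currently unreachable because of the exception above) is: + # compute the average posterior over a subset of egs, adjust the model priors, re-align + # the data with the current model, and then relabel the existing egs with the new alignments.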
+ + avg_post_vec_file = ComputeAveragePosterior(dir, iter, prev_egs_dir, + num_archives, prior_subset_size, run_opts) + + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + logger.info("Re-adjusting priors based on computed posteriors") + model = '{0}/{1}.mdl'.format(dir, iter) + AdjustAmPriors(dir, model, avg_post_vec_file, model, run_opts) + + alidir = Align(dir, feat_dir, lang, run_opts, iter, + transform_dir, online_ivector_dir) + RunKaldiCommand(""" +steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} {alidir} \ + {prev_egs_dir} {cur_egs_dir}""".format( + command = run_opts.command, + iter = iter, + dir = dir, + alidir = alidir, + prev_egs_dir = prev_egs_dir, + cur_egs_dir = cur_egs_dir)) + +def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, + num_archives_to_process, + initial_effective_lrate, final_effective_lrate): + if iter + 1 >= num_iters: + effective_learning_rate = final_effective_lrate + else: + effective_learning_rate = initial_effective_lrate * math.exp(num_archives_processed * math.log(final_effective_lrate/ initial_effective_lrate)/num_archives_to_process) + + return num_jobs * effective_learning_rate + +def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): + + if iter == 0: + return True + + try: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + output = output.strip().split("\n") + # eg. + # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + + mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") + total_mean_deriv = 0 + num_derivs = 0 + for line in output: + mat_obj = mean_pattern.search(line) + if mat_obj is None: + raise Exception("Something went wrong, unable to find deriv-avg in the line \n{0}".format(line)) + mean_deriv = float(mat_obj.groups()[0]) + total_mean_deriv += mean_deriv + num_derivs += 1 + if total_mean_deriv / num_derivs < shrink_threshold: + return True + except ValueError: + raise Exception("Error while parsing the model info output") + + return False + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark:nnet3-merge-egs ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark:nnet3-merge-egs ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + +def ComputeProgress(dir, iter, egs_dir, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ +nnet3-show-progress --use-gpu=no 
"nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ +"ark:nnet3-merge-egs --minibatch-size=256 ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model, + egs_dir = egs_dir), wait = wait) + +def CombineModels(dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = None): + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + print num_iters_combine + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + + if chunk_width is not None: + # this is an RNN model + mbsize = int(1024.0/(chunk_width)) + else: + mbsize = 1024 + + RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} "ark:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + dir = dir, raw_models = " ".join(raw_model_strings), + mbsize = mbsize, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + +def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts): + # Note: this just uses CPUs, using a smallish subset of data. + """ Computes the average posterior of the network""" + import glob + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + + if run_opts.num_jobs_compute_prior > num_archives: + egs_part = 1 + else: + egs_part = 'JOB' + + RunKaldiCommand(""" +{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ +matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + """.format(command = run_opts.command, + dir = dir, + num_jobs_compute_prior = run_opts.num_jobs_compute_prior, + prior_queue_opt = run_opts.prior_queue_opt, + iter = iter, prior_subset_size = prior_subset_size, + egs_dir = egs_dir, egs_part = egs_part, + prior_gpu_opt = run_opts.prior_gpu_opt)) + + # make sure there is time for $dir/post.{iter}.*.vec to appear. 
+ + time.sleep(5) + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + RunKaldiCommand(""" +{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} + """.format(command = run_opts.command, + dir = dir, iter = iter, output_file = avg_post_vec_file)) + + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + return avg_post_vec_file + +def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): + RunKaldiCommand(""" +{command} {dir}/log/adjust_priors.final.log \ +nnet3-am-adjust-priors {input_model} {avg_posterior_vector} {output_model} + """.format(command = run_opts.command, + dir = dir, input_model = input_model, + avg_posterior_vector = avg_posterior_vector, + output_model = output_model)) + +def RemoveEgs(egs_dir): + RunKaldiCommand("steps/nnet2/remove_egs.sh {egs_dir}".format(egs_dir=egs_dir)) + +def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, + preserve_model_interval = 100, + remove_egs = True): + try: + if remove_egs: + RemoveEgs(egs_dir) + + for iter in range(num_iters): + RemoveModel(nnet_dir, iter, num_iters, 1, + preserve_model_interval) + except (IOError, OSError) as err: + logger.warning("Error while cleaning up the nnet directory") + raise err + +def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, + preserve_model_interval = 100): + if iter % preserve_model_interval == 0: + return + if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : + return + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if os.path.isfile(file_name): + os.remove(file_name) + diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py new file mode 100755 index 00000000000..5c64aab18f0 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + +import warnings +import imp +import argparse +import os +import errno +import logging +import re +import subprocess +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +try: + import matplotlib as mpl + mpl.use('Agg') + import matplotlib.pyplot as plt + from matplotlib.backends.backend_pdf import PdfPages + import numpy as np + + plot = True +except ImportError: + warnings.warn(""" +This script requires matplotlib and numpy. Please install them to generate plots. Proceeding with generation of tables. +If you are on a cluster where you do not have admin rights you could try using virtualenv.""") + # matplotlib/numpy are not available; fall back to generating only the text reports. + plot = False + +nlp = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Generating plots') + + + + +def GetArgs(): + parser = argparse.ArgumentParser(description=""" +Parses the training logs and generates a variety of plots. +example : steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 exp/nnet3/tdnn exp/nnet3/tdnn/report +""") + parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison.
These will only be used for plots, not tables") + parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) + parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Set this to true for chain models; log-probability plots are then generated instead of accuracy plots") + parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") + parser.add_argument("output_dir", help="output directory for the report, e.g. exp/nnet3/tdnn/report") + + args = parser.parse_args() + if args.comparison_dir is not None and len(args.comparison_dir) > 6: + raise Exception("A maximum of 6 --comparison-dir options can be specified. If you want to compare more directories, you would have to extend the plot_colors variable, which specifies the colors used for plotting.") + assert(args.start_iter >= 1) + return args + +plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan' ] + + + +class LatexReport: + def __init__(self, pdf_file): + self.pdf_file = pdf_file + self.document=[] + self.document.append(""" +\documentclass[prl,10pt,twocolumn]{revtex4} +\usepackage{graphicx} % Used to import the graphics +\\begin{document} +""") + + def AddFigure(self, figure_pdf, title): + # we will have to keep extending this replacement list based on errors during compilation + # escaping underscores in the title + title = "\\texttt{"+re.sub("_","\_", title)+"}" + fig_latex = """ +%... +\\begin{figure}[t] + \\begin{center} + \caption{""" + title + """} + \includegraphics[width=\\textwidth]{""" + figure_pdf + """} + \end{center} +\end{figure} +%... +""" + self.document.append(fig_latex) + + def Close(self): + self.document.append("\end{document}") + return self.Compile() + + def Compile(self): + root, ext = os.path.splitext(self.pdf_file) + dir_name = os.path.dirname(self.pdf_file) + latex_file = root + ".tex" + lat_file = open(latex_file, "w") + lat_file.write("\n".join(self.document)) + lat_file.close() + try: + proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc.communicate() + except Exception as e: + logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) + return False + return True + +def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + if plot: + fig = plt.figure() + plots = [] + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + for dir in dirs: + [accuracy_report, accuracy_times, accuracy_data] = nlp.GenerateAccuracyReport(dir, key) + if index == 0: + # this is the main experiment directory + acc_file = open("{0}/{1}.log".format(output_dir, file_basename), "w") + acc_file.write(accuracy_report) + acc_file.close() + + if plot: + color_val = plot_colors[index] + data = np.array(accuracy_data) + if data.shape[0] == 0: + raise Exception("Couldn't find any rows for the accuracy plot") + data = data[data[:,0]>=start_iter, :] + plot_handle, = plt.plot(data[:, 0], data[:, 1], color = color_val, linestyle = "--", label = "train {0}".format(dir)) + plots.append(plot_handle) + plot_handle, = plt.plot(data[:, 0], data[:, 2], color = color_val, label = "valid {0}".format(dir)) + plots.append(plot_handle) + index += 1 + if plot: + plt.xlabel('Iteration') + plt.ylabel(key) + lgd = plt.legend(handles=plots, loc='lower center',
bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1 ), ncol=1, borderaxespad=0.) + plt.grid(True) + fig.suptitle("{0} plot".format(key)) + figfile_name = '{0}/{1}.pdf'.format(output_dir, file_basename) + plt.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Plot of {0} vs iterations".format(key)) + +def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + stats_per_dir = {} + + for dir in dirs: + stats_per_component_per_iter = nlp.ParseProgressLogsForNonlinearityStats(dir) + stats_per_dir[dir] = stats_per_component_per_iter + + # convert the nonlin stats into tables + stat_tables_per_component_per_dir = {} + for dir in dirs: + stats_per_component_per_iter = stats_per_dir[dir] + component_names = stats_per_component_per_iter.keys() + stat_tables_per_component = {} + for component_name in component_names: + comp_data = stats_per_component_per_iter[component_name] + comp_type = comp_data['type'] + comp_stats = comp_data['stats'] + iters = comp_stats.keys() + iters.sort() + iter_stats = [] + for iter in iters: + iter_stats.append([iter] + comp_stats[iter]) + stat_tables_per_component[component_name] = iter_stats + stat_tables_per_component_per_dir[dir] = stat_tables_per_component + + main_stat_tables = stat_tables_per_component_per_dir[exp_dir] + for component_name in main_stat_tables.keys(): + # this is the main experiment directory + file = open("{dir}/nonlinstats_{comp_name}.log".format(dir = output_dir, comp_name = component_name), "w") + file.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\n") + iter_stat_report = "" + iter_stats = main_stat_tables[component_name] + for row in iter_stats: + iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" + file.write(iter_stat_report) + file.close() + + if plot: + main_component_names = main_stat_tables.keys() + main_component_names.sort() + + plot_component_names = set(main_component_names) + for dir in dirs: + component_names = set(stats_per_dir[dir].keys()) + plot_component_names = plot_component_names.intersection(component_names) + plot_component_names = list(plot_component_names) + plot_component_names.sort() + if plot_component_names != main_component_names: + logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. 
Make sure that these are comparable experiments before analyzing these plots.") + + fig = plt.figure() + for component_name in main_component_names: + fig.clf() + index = 0 + plots = [] + for dir in dirs: + color_val = plot_colors[index] + index += 1 + try: + iter_stats = stat_tables_per_component_per_dir[dir][component_name] + except KeyError: + # this component is not available in this network so lets not just plot it + continue + + data = np.array(iter_stats) + data = data[data[:,0] >=start_iter, :] + ax = plt.subplot(211) + mp, = ax.plot(data[:,0], data[:,1], color=color_val, label="Mean {0}".format(dir)) + msph, = ax.plot(data[:,0], data[:,1] + data[:,2], color=color_val, linestyle='--', label = "Mean+-Stddev {0}".format(dir)) + mspl, = ax.plot(data[:,0], data[:,1] - data[:,2], color=color_val, linestyle='--') + plots.append(mp) + plots.append(msph) + ax.set_ylabel('Value-{0}'.format(comp_type)) + ax.grid(True) + + ax = plt.subplot(212) + mp, = ax.plot(data[:,0], data[:,3], color=color_val) + msph, = ax.plot(data[:,0], data[:,3] + data[:,4], color=color_val, linestyle='--') + mspl, = ax.plot(data[:,0], data[:,3] - data[:,4], color=color_val, linestyle='--') + ax.set_xlabel('Iteration') + ax.set_ylabel('Derivative-{0}'.format(comp_type)) + ax.grid(True) + + lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + plt.grid(True) + fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) + +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): + try: + os.makedirs(output_dir) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(output_dir): + pass + else: + raise e + if plot: + latex_report = LatexReport("{0}/report.pdf".format(output_dir)) + else: + latex_report = None + + if is_chain: + logger.info("Generating log-probability plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating accuracy plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating log-likelihood plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating non-linearity stats plots") + GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating parameter difference files") + # Parameter changes + key_file = {"Parameter differences":"parameter.diff", + "Relative parameter differences":"relative_parameter.diff"} + for key in key_file.keys(): + file = open("{0}/{1}".format(output_dir, key_file[key]), "w") + data = nlp.ParseProgressLogsForParamDiff(exp_dir, key) + for row in data: + file.write(" 
".join(map(lambda x:str(x),row))+"\n") + file.close() + if plot and latex_report is not None: + has_compiled = latex_report.Close() + if has_compiled: + logger.info("Report has been generated. You can find it at the location {0}".format("{0}/report.pdf".format(output_dir))) + +def Main(): + args = GetArgs() + GeneratePlots(args.exp_dir, args.output_dir, + comparison_dir = args.comparison_dir, + start_iter = args.start_iter, + is_chain = args.is_chain) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py new file mode 100755 index 00000000000..1c2f3a1e9b8 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py @@ -0,0 +1,155 @@ +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + +from __future__ import division +import sys, glob, re, math, datetime, argparse +import imp + +ntl = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +#exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] +def ParseProgressLogsForNonlinearityStats(exp_dir): + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + stats_per_component_per_iter = {} + + progress_log_lines = ntl.RunKaldiCommand('grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files))[0] + + parse_regex = re.compile(".*progress.([0-9]+).log:component name=(.+) type=(.*)Component,.*value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]") + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', '0.134', '0.0397') + iteration = int(groups[0]) + component_name = groups[1] + component_type = groups[2] + value_mean = float(groups[3]) + value_stddev = float(groups[4]) + deriv_mean = float(groups[5]) + deriv_stddev = float(groups[6]) + try: + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + except KeyError: + stats_per_component_per_iter[component_name] = {} + stats_per_component_per_iter[component_name]['type'] = component_type + stats_per_component_per_iter[component_name]['stats'] = {} + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + + return stats_per_component_per_iter + +def ParseDifferenceString(string): + dict = {} + for parts in string.split(): + sub_parts = parts.split(":") + dict[sub_parts[0]] = float(sub_parts[1]) + return dict + +#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 
Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] +def ParseProgressLogsForParamDiff(exp_dir, pattern): + if pattern not in set(["Relative parameter differences", "Parameter differences"]): + raise Exception("Unknown value for pattern : {0}".format(pattern)) + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + progress_per_iter = {} + component_names = set([]) + progress_log_lines = ntl.RunKaldiCommand('grep -e "{0}" {1}'.format(pattern, progress_log_files))[0] + parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern)) + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + iteration = groups[0] + differences = ParseDifferenceString(groups[1]) + component_names = component_names.union(differences.keys()) + progress_per_iter[int(iteration)] = differences + + component_names = list(component_names) + component_names.sort() + # rearranging the data into an array + data = [] + data.append(["iteration"]+component_names) + max_iter = max(progress_per_iter.keys()) + for iter in range(max_iter + 1): + try: + component_dict = progress_per_iter[iter] + except KeyError: + continue + iter_values = [] + for component_name in component_names: + try: + iter_values.append(component_dict[component_name]) + except KeyError: + # the component was not found this iteration, may be because of layerwise discriminative training + iter_values.append(0) + data.append([iter] + iter_values) + + return data + +def ParseTrainLogs(exp_dir): + train_log_files = "%s/log/train.*.log" % (exp_dir) + train_log_lines = ntl.RunKaldiCommand('grep -e Accounting {0}'.format(train_log_files))[0] + parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*") + + train_times = {} + for line in train_log_lines.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + try: + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + except KeyError: + train_times[int(groups[0])] = {} + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + iters = train_times.keys() + for iter in iters: + values = train_times[iter].values() + train_times[iter] = max(values) + return train_times + +def ParseProbLogs(exp_dir, key = 'accuracy'): + train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) + valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) + train_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, train_prob_files), wait = True)[0] + valid_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, valid_prob_files))[0] + + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames. + parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-e]+) .*per frame") + train_loss={} + valid_loss={} + + + for line in train_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + train_loss[int(groups[0])] = groups[2] + for line in valid_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + valid_loss[int(groups[0])] = groups[2] + iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) + iters.sort() + return map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters) + +def GenerateAccuracyReport(exp_dir, key = "accuracy"): + times = ParseTrainLogs(exp_dir) + data = ParseProbLogs(exp_dir, key) + report = [] + report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") + for x in data: + try: + report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) + except KeyError: + continue + + total_time = 0 + for iter in times.keys(): + total_time += times[iter] + report.append("Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time)))) + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..57291324d28 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,485 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. 
'-3,-2,-1,0,1,2,3'") + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'weighted-average', 'per-dim-weighted-average', 'multi-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = True) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + raise Exception("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + raise Exception("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + args.nonlin_input_dim = args.relu_dim + args.nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + raise Exception("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + args.nonlin_input_dim = args.pnorm_input_dim + args.nonlin_output_dim = args.pnorm_output_dim + + return args + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats, num_feats) + + return [output_descriptor, filter_context, filter_context] + +def AddMultiDimAffineLayer(config_lines, name, input, input_window, block_input_dim, block_output_dim): + assert(block_input_dim % input_window== 0) + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that 
columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats / (block_input_dim / input_window) * block_output_dim, num_feats / (block_input_dim/ input_window)) + + return [output_descriptor, filter_context, filter_context] + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + try: + import scipy.signal as signal + import numpy as np + except ImportError: + raise Exception(" This recipe cannot be run without scipy." + " You can install it using the command \n" + " pip install scipy\n" + " If you do not have admin access on the machine you are" + " trying to run this recipe, you can try using" + " virtualenv") + # low-pass smoothing of input was specified. so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = list(np.append(lp_filter, 0)) + nnet3_train_lib.WriteKaldiMatrix(lpfilt_filename, [lp_filter]) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(" "); # we already checked the string is nonempty. 
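+ # e.g. the default from train.sh, "-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0", parses into + # 6 splice arrays (hence 6 hidden layers) with left_context = right_context = 4+0+2+0+4+0 = 10.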
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + e) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +def MakeConfigs(config_dir, splice_indexes_string, + feat_dim, ivector_dim, num_targets, + nonlin_input_dim, nonlin_output_dim, subset_dim, + pool_type, pool_window, pool_lpfilter_width, + use_presoftmax_prior_scale, final_layer_normalize_target, + include_log_softmax, xent_regularize, xent_separate_forward_affine, self_repair_scale): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if pool_type != "none" and pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(pool_type)) + if pool_type in set(["low-pass", "weighted-average"]): + if pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + pool_lpfilter_width, + pool_window, + config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + elif pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = AddPerDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + pool_window) + + left_context += cur_left_context + right_context += cur_right_context + elif pool_type == "multi-dim-weighted-average": + [prev_layer_output, cur_left_context, cur_right_context] = AddMultiDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + pool_window, + 10 * pool_window, 10) + left_context += cur_left_context + right_context += cur_right_context + + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + prev_layer_output_chain = 
nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + nonlin_input_dim = args.nonlin_input_dim, + nonlin_output_dim = args.nonlin_output_dim, + subset_dim = args.subset_dim, + pool_type = args.pool_type, pool_window = args.pool_window, + pool_lpfilter_width = args.pool_lpfilter_width, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale) + +if __name__ == "__main__": + Main() + diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh 
b/egs/wsj/s5/steps/nnet3/tdnn/train.sh new file mode 100755 index 00000000000..773e10ccab6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -0,0 +1,660 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. 
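+# Note: the realignment options below take effect only when realign_times is non-empty; +# in that case align_cmd and align_use_gpu must also be set (this is checked further below).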
+align_cmd= # The cmd that is passed to steps/nnet2/align.sh +align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] +realign_times= # List of times on which we realign. Each time is + # floating point number strictly between 0 and 1, which + # will be multiplied by the num-iters to get an iteration + # number. +num_jobs_align=30 # Number of jobs for realignment +# End configuration section. +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" + echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" + echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" + echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +if [ ! 
-z "$realign_times" ]; then + [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1 + [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1 +fi + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 +[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1 +[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1 + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! 
-z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" $egs_opts \ + --frames-per-eg $frames_per_eg \ + $data $alidir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... 
using priors^$presoftmax_prior_scale_power and rescaling to average 1"
+
+  # obtain the raw pdf counts from the alignments
+  $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
+    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
+    post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1;
+  $cmd $dir/log/sum_pdf_counts.log \
+    vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1;
+  rm $dir/pdf_counts.*
+
+  # Smooth the counts, raise the resulting priors to the configured power and
+  # rescale so that the scales average to 1; write the result as a text vector.
+  awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \
+    '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; }
+       num_pdfs=NF-2; average_count = total/num_pdfs;
+       for (i=0; i<num_pdfs; i++) { count[i] += smooth*average_count; }
+       total += smooth*average_count*num_pdfs;
+       for (i=0; i<num_pdfs; i++) { scale[i] = (count[i]/total)^power; scale_total += scale[i]; }
+       average_scale = scale_total/num_pdfs;
+       printf "[ ";
+       for (i=0; i<num_pdfs; i++) { printf("%f ", scale[i]/average_scale); }
+       print "]"; }' $dir/pdf_counts > $dir/presoftmax_prior_scale.vec
+  ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec
+fi
+
+if [ $stage -le -1 ]; then
+  # Add the first layer; this will add in the lda.mat and
+  # presoftmax_prior_scale.vec.
+  $cmd $dir/log/add_first_layer.log \
+    nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;
+
+  # Convert to .mdl, train the transitions, set the priors.
+  $cmd $dir/log/init_mdl.log \
+    nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \
+    nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1;
+fi
+
+
+# set num_iters so that, as closely as possible, we process the data $num_epochs
+# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded,
+# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+
+num_archives_to_process=$[$num_epochs*$num_archives_expanded]
+num_archives_processed=0
+num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
+
+finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
+
+# check that there are enough iterations to finish adding all hidden layers.
+! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
+  && echo "$0: Insufficient epochs" && exit 1
+
+echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
+
+if $use_gpu; then
+  parallel_suffix=""
+  train_queue_opt="--gpu 1"
+  combine_queue_opt="--gpu 1"
+  prior_gpu_opt="--use-gpu=yes"
+  prior_queue_opt="--gpu 1"
+  parallel_train_opts=
+  if ! cuda-compiled; then
+    echo "$0: WARNING: you are running with one thread but you have not compiled"
+    echo "  for CUDA. You may be running a setup optimized for GPUs. If you have"
+    echo "  GPUs and have nvcc installed, go to src/ and do ./configure; make"
+    exit 1
+  fi
+else
+  echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads."
+  parallel_train_opts="--use-gpu=no"
+  combine_queue_opt=""  # the combine stage will be quite slow if not using
+                        # GPU, as we didn't enable that program to use
+                        # multiple threads.
+  prior_gpu_opt="--use-gpu=no"
+  prior_queue_opt=""
+fi
+
+
+approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
+# First work out how many iterations we want to combine over in the final
+# nnet3-combine-fast invocation. (We may end up subsampling from these if the
+# number exceeds max_model_combine).
The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + +for realign_time in $realign_times; do + # Work out the iterations on which we will re-align, if the --realign-times + # option was used. This is slightly approximate. + ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \ + echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1."; + # the next formula is based on the one for mix_up_iter above. + realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1; + realign_this_iter[$realign_iter]=$realign_time +done + +cur_egs_dir=$egs_dir + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ ! -z "${realign_this_iter[$x]}" ]; then + prev_egs_dir=$cur_egs_dir + cur_egs_dir=$dir/egs_${realign_this_iter[$x]} + fi + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + if [ ! -z "${realign_this_iter[$x]}" ]; then + time=${realign_this_iter[$x]} + + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
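The iteration count, the ramp-up of parallel jobs and the learning-rate schedule used in this loop can be sanity-checked outside the script. The archive and epoch counts below are invented for illustration; only the formulas mirror the ones above.

  ilr=0.01; flr=0.001                  # initial/final effective learning rates
  nt=1200                              # num_epochs * num_archives_expanded (invented)
  num_jobs_initial=1; num_jobs_final=8
  num_iters=$[(2*$nt)/($num_jobs_initial+$num_jobs_final)]   # -> 266 iterations
  # The effective learning rate decays exponentially in the number of archives
  # processed so far (np), from ilr at np=0 to flr at np=nt:
  for np in 0 600 1200; do
    perl -e "print $ilr*exp($np*log($flr/$ilr)/$nt), qq(\n);"
  done
  # -> 0.01, ~0.00316, 0.001.  The rate actually passed to nnet3-train is this
  # value times the number of jobs on that iteration, so that averaging the
  # jobs' models leaves the effective step size unchanged.
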
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.$x.log \ + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + + sleep 2 + + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ + --iter $x $data $lang $dir $dir/ali_$time || exit 1 + + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + $prev_egs_dir $cur_egs_dir || exit 1 + + if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then + steps/nnet3/remove_egs.sh $prev_egs_dir + fi + fi + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
+ frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) + + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
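Each parallel training job in the loop above derives its archive and frame offset from a single running counter k; the numbers below are made up, but tracing them shows how jobs cycle through archives first and through frame offsets much more slowly.

  num_archives=10; frames_per_eg=8; num_archives_processed=1000   # invented values
  for n in 1 2 3; do                              # three jobs on this iteration
    k=$[$num_archives_processed+$n-1]             # zero-based running counter
    archive=$[($k%$num_archives)+1]               # 1-based archive index
    frame=$[(($k/$num_archives)%$frames_per_eg)]  # 0-based frame offset
    echo "job $n -> archive $archive, frame $frame"
  done
  # job 1 -> archive 1, frame 4
  # job 2 -> archive 2, frame 4
  # job 3 -> archive 3, frame 4
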
+ $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & +fi + +if [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.final.log \ + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; +fi + + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $cur_egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.mdl + fi + done +fi diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh new file mode 100644 index 00000000000..8f33272b97f --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh @@ -0,0 +1,551 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014-2016 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. 
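The prior_subset_size option above controls how many examples go into the average-posterior computation that the prior-computation stages of these scripts rely on. A rough sketch of what that stage amounts to, with made-up counts for three pdfs:

  # Summed average posteriors (invented numbers); normalizing them gives the
  # priors that are stored with the model and later used to turn the network's
  # posteriors into pseudo-likelihoods at decode time.
  echo "40000 100000 60000" | \
    awk '{ for(i=1;i<=NF;i++) t+=$i; for(i=1;i<=NF;i++) printf("%.2f ", $i/t); print "" }'
  # -> 0.20 0.50 0.30
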
+num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +configs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +frames_per_eg=8 # to be passed on to get_egs.sh + +# Raw nnet training options i.e. without transition model +nj=4 +dense_targets=true # Use dense targets instead of sparse targets + +# End configuration section. + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train scp:snr_targets/targets.scp exp/nnet3_snr_predictor" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... 
note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +targets_scp=$2 +dir=$3 + +# Check some files. +for f in $data/feats.scp $targets_scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if $add_final_sigmoid && $include_log_softmax; then + echo "add-final-sigmoid and include-log-softmax cannot both be true" +fi + +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + +if [ ! -z "$configs_dir" ]; then + cp -rT $configs_dir $dir/configs || exit 1 +fi + +if [ $stage -le -5 ]; then + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# model_left_context=(something) +# model_right_context=(something) +# num_hidden_layers=(something) +# num_targets=(something) +# add_lda=(true|false) +# include_log_softmax=(true|false) +# objective_type=(something) +. $dir/configs/vars || exit 1; +left_context=$model_left_context +right_context=$model_right_context + +[ -z "$num_targets" ] && echo "\$num_targets is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$add_lda" ] && echo "\$add_lda is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$include_log_softmax" ] && echo "\$include_log_softmax is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$objective_type" ] && echo "\$objective_type is not defined. Needs to be defined in $dir/configs/vars." && exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +if $dense_targets; then + tmp_num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + if [ $tmp_num_targets -ne $num_targets ]; then + echo "Mismatch between num-targets provided to script vs configs" + exit 1 + fi +fi + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! 
-z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir "$transform_dir") + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + + if $dense_targets; then + target_type=dense + else + target_type=sparse + fi + + steps/nnet3/get_egs_targets.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" --nj $nj \ + --frames-per-eg $frames_per_eg \ + --target-type $target_type --num-targets $num_targets \ + $data $targets_scp $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if $add_lda && [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
+ $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + + +compute_accuracy=false +if [ "$objective_type" == "linear" ]; then + compute_accuracy=true +fi + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no $dir/$[x-1].raw $dir/$x.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. 
+ $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list $dir/$[x+1].raw || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-copy $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].raw ] && exit 1; + if [ -f $dir/$[$x-1].raw ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].raw + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.raw" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + nnet=$dir/$iter.raw + [ ! -f $nnet ] && echo "Expected $nnet to exist" && exit 1; + nnets_list[$n]=$nnet + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) + + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$egs_dir/combine.egs ark:-|" \ + $dir/final.raw || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & +fi + +if $include_log_softmax && [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purpose of using as prior to convert posteriors to likelihoods." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + $dir/final.raw ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
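The set of models handed to the nnet3-combine stage above is determined by the min/max expression computed earlier in the script; with some invented sizes the arithmetic works out as follows.

  max_models_combine=20; num_archives_expanded=80; num_jobs_final=8   # invented
  num_iters=266; finish_add_layers_iter=12                            # invented
  approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]   # 10
  num_iters_combine=$max_models_combine                                    # 20
  [ $num_iters_combine -lt $approx_iters_per_epoch_final ] && \
    num_iters_combine=$approx_iters_per_epoch_final                        # unchanged
  half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]    # 127
  [ $num_iters_combine -gt $half_iters_after_add_layers ] && \
    num_iters_combine=$half_iters_after_add_layers                         # unchanged
  first_model_combine=$[$num_iters-$num_iters_combine+1]                   # 247
  echo "combine models $first_model_combine..$num_iters"                   # 247..266
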
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm -f $dir/post.$x.*.vec; + +fi + + +if [ ! -f $dir/final.raw ]; then + echo "$0: $dir/final.raw does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.raw + fi + done +fi + diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py new file mode 100755 index 00000000000..cde3ef14933 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward DNN acoustic model using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. 
If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + "iteration. If 0 or a large value the randomization is" + "complete, but this will consume memory and cause spikes" + "in disk I/O. Smaller is easier on disk and memory but" + "less random. It's not a huge deal though, as samples" + "are anyway randomized right at the start." + "(the point of this is to get data in different" + "minibatches on different iterations, since in the" + "preconditioning method, 2 samples in the same minibatch" + "can affect each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per minibatch," + "measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. 
'0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. 
""") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs" + " directory which is the output of make_configs.py script") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "ark:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + frame = frame, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + minibatch_size = minibatch_size), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. 
+ raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. 
This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if (args.stage <= -5): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -4) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.frames_per_eg == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -3): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -2): + logger.info("Computing initial vector for FixedScaleComponent before" + " softmax, using priors^{prior_scale} and rescaling to" + " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + + ComputePresoftmaxPriorScale(args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
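+    # A worked example of the quantities computed just below, with
+    # hypothetical values num_archives=100, frames_per_eg=8, num_epochs=4,
+    # num_jobs_initial=2, num_jobs_final=14:
+    #   num_archives_expanded   = 100 * 8               = 800
+    #   num_archives_to_process = 4 * 800               = 3200
+    #   num_iters               = (3200 * 2) / (2 + 14) = 400
+    # so an average of (2 + 14) / 2 = 8 jobs per iteration gives
+    # 400 * 8 = 3200 archive-passes, i.e. 4 epochs over the 800 expanded archives.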
+ num_archives_expanded = num_archives * args.frames_per_eg + num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + + logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + args.minibatch_size, args.frames_per_eg, + num_hidden_layers, args.add_layers_period, + left_context, right_context, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + 
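+        # Background on why the priors are re-adjusted (general Kaldi
+        # practice, not specific to this script): at decode time the
+        # network's posteriors are converted to pseudo-likelihoods roughly as
+        #   log p(x|pdf) ~ log P(pdf|x) - log P(pdf) + const
+        # so the prior vector stored in the model matters.  Here the average
+        # posterior of the combined model over a subset of egs (computed
+        # above) is used as the prior estimate and written into final.mdl by
+        # the AdjustAmPriors call below.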
logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py new file mode 100755 index 00000000000..463b0a0d3ff --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains an RNN acoustic model using the cross-entropy objective. + RNNs include LSTMs, BLSTMs and GRUs. + RNN acoustic model training differs from feed-forward DNN training + in the following ways + 1. RNN acoustic models train on output chunks rather than individual + outputs + 2. The training includes additional stage of shrinkage, where + the parameters of the model are scaled when the derivative averages + at the non-linearities are below a threshold. + 3. 
RNNs can also be trained with state preservation training + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 20, + help="""Number of output labels in the sequence + used to train an LSTM. + Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. '0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. 
+ Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be positive") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be positive") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. 
""" + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
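+    # Note that, unlike the DNN trainer, there is no frames_per_eg expansion
+    # here: each RNN eg is a whole chunk of chunk_width labels and the
+    # --frame option is not used, so an archive is processed once per epoch.
+    # A worked example with hypothetical values num_archives=240, num_epochs=8,
+    # num_jobs_initial=2, num_jobs_final=14:
+    #   num_archives_to_process = 8 * 240               = 1920
+    #   num_iters               = (1920 * 2) / (2 + 14) = 240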
+ num_archives_to_process = args.num_epochs * num_archives + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + if args.num_bptt_steps is None: + num_bptt_steps = args.chunk_width + else: + num_bptt_steps = args.num_bptt_steps + + min_deriv_time = args.chunk_width - num_bptt_steps + + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + left_context, right_context, min_deriv_time, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, 
num_iters_combine, egs_dir, run_opts, + chunk_width = args.chunk_width) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index e17026e496f..d8ac11da720 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -93,6 +93,7 @@ echo -n >$ieconf cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1; echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf +echo "--ivector-period=$ivector_period" >>$ieconf echo "--splice-config=$dir/conf/splice.conf" >>$ieconf echo "--lda-matrix=$srcdir/final.mat" >>$ieconf echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf diff --git a/egs/wsj/s5/steps/paste_feats.sh b/egs/wsj/s5/steps/paste_feats.sh index da82179f616..abeee5aba23 100755 --- a/egs/wsj/s5/steps/paste_feats.sh +++ b/egs/wsj/s5/steps/paste_feats.sh @@ -44,10 +44,10 @@ done mkdir -p $ark_dir $logdir -mkdir -p $data +mkdir -p $data cp $data_src_first/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. -rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` @@ -58,19 +58,25 @@ for data_src in ${data_src_arr[@]}; do data_src_args="$data_src_args scp:$data_src/split$nj/JOB/feats.scp" done +for n in $(seq $nj); do + # the next command does nothing unless $arkdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
+ utils/create_data_link.pl $arkdir/pasted_$name.$n.ark +done + $cmd JOB=1:$nj $logdir/append.JOB.log \ paste-feats --length-tolerance=$length_tolerance $data_src_args ark:- \| \ copy-feats --compress=$compress ark:- \ ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + # concatenate the .scp files together. for ((n=1; n<=nj; n++)); do cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/score_kaldi.sh b/egs/wsj/s5/steps/score_kaldi.sh index 8a2aee9d48d..f054ebdb41d 100755 --- a/egs/wsj/s5/steps/score_kaldi.sh +++ b/egs/wsj/s5/steps/score_kaldi.sh @@ -137,6 +137,12 @@ if [ $stage -le 1 ]; then cat $dir/scoring_kaldi/wer_details/per_utt \| \ utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi fi diff --git a/egs/wsj/s5/steps/score_kaldi_compare.sh b/egs/wsj/s5/steps/score_kaldi_compare.sh new file mode 100755 index 00000000000..91fc057b906 --- /dev/null +++ b/egs/wsj/s5/steps/score_kaldi_compare.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2016 Nicolas Serrano +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +replications=10000 +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_compare.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --replications # number of bootstrap evaluation to compute confidence." + exit 1; +fi + +dir1=$1 +dir2=$2 +dir_compare=$3 + +mkdir -p $dir_compare/log + +for d in $dir1 $dir2; do + for f in test_filt.txt best_wer; do + [ ! -f $d/$f ] && echo "score_compare.sh: no such file $d/$f" && exit 1; + done +done + + +best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) +best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') + +best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) +best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') + +$cmd $dir_compare/log/score_compare.log \ + compute-wer-bootci --replications=$replications \ + ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ + '>' $dir_compare/wer_bootci_comparison || exit 1; + +exit 0; diff --git a/egs/wsj/s5/steps/select_feats.sh b/egs/wsj/s5/steps/select_feats.sh index 970823fdf25..072dd3194cf 100755 --- a/egs/wsj/s5/steps/select_feats.sh +++ b/egs/wsj/s5/steps/select_feats.sh @@ -43,31 +43,31 @@ mkdir -p $ark_dir $logdir mkdir -p $data cp $data_in/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. 
-rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` -for j in $(seq $nj); do +for j in $(seq $nj); do # the next command does nothing unless $mfccdir/storage/ exists, see # utils/create_data_link.pl for more info. - utils/create_data_link.pl $ark_dir/pasted_$name.$j.ark + utils/create_data_link.pl $ark_dir/selected_$name.$j.ark done $cmd JOB=1:$nj $logdir/append.JOB.log \ select-feats "$selector" scp:$data_in/split$nj/JOB/feats.scp ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + ark,scp:$ark_dir/selected_$name.JOB.ark,$ark_dir/selected_$name.JOB.scp || exit 1; + # concatenate the .scp files together. for ((n=1; n<=nj; n++)); do - cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; + cat $ark_dir/selected_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" exit 1; diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index bb4d4e77e7c..5e1a9cba470 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -46,7 +46,7 @@ srcdir=$1 destdir=$2 if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" exit 1; fi @@ -82,7 +82,7 @@ if [ -f $srcdir/segments ]; then cp $srcdir/reco2file_and_channel $destdir/ fi else # no segments->wav indexed by utt. - if [ -f $srcdir/wav.scp ]; then + if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp fi fi @@ -90,6 +90,9 @@ fi if [ -f $srcdir/text ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi diff --git a/egs/wsj/s5/utils/create_split_dir.pl b/egs/wsj/s5/utils/create_split_dir.pl index 0c4f023f7f3..dc94f3bad43 100755 --- a/egs/wsj/s5/utils/create_split_dir.pl +++ b/egs/wsj/s5/utils/create_split_dir.pl @@ -53,6 +53,7 @@ # If the symbolic link already exists, delete it. 
if (-l $pseudo_storage) { print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n"; + $index++; next; } diff --git a/egs/wsj/s5/utils/data/combine_data.sh b/egs/wsj/s5/utils/data/combine_data.sh new file mode 120000 index 00000000000..0aed7e823b7 --- /dev/null +++ b/egs/wsj/s5/utils/data/combine_data.sh @@ -0,0 +1 @@ +../combine_data.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/copy_data_dir.sh b/egs/wsj/s5/utils/data/copy_data_dir.sh new file mode 120000 index 00000000000..b9854db4655 --- /dev/null +++ b/egs/wsj/s5/utils/data/copy_data_dir.sh @@ -0,0 +1 @@ +../copy_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh new file mode 100755 index 00000000000..77f5f8eb7dc --- /dev/null +++ b/egs/wsj/s5/utils/data/get_frame_shift.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script takes as input a data directory, such as data/train/, preferably +# with utt2dur file already existing (or the utt2dur file will be created if +# not), and it attempts to work out the approximate frame shift by comparing the +# utt2dur with the output of feat-to-len on the feats.scp. It prints it out. +# if the shift is very close to, but above, 0.01 (the normal frame shift) it +# rounds it down. + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + echo "This script prints the frame-shift (e.g. 0.01) to the standard out." + echo "If does not contain utt2dur, this script will call utils/data/get_utt2dur.sh," + echo "which will require write permission to " + exit 1 +fi + +export LC_ALL=C + +dir=$1 + +if [ ! -f $dir/utt2dur ]; then + echo "$0: $dir/utt2dur does not exist: creating it" 1>&2 + utils/data/get_utt2dur.sh $dir 1>&2 +fi + +if [ ! -f $dir/feats.scp ]; then + echo "$0: $dir/feats.scp does not exist" 1>&2 + exit 1 +fi + +temp=$(mktemp /tmp/tmp.XXXX) + +feat-to-len scp:$dir/feats.scp ark,t:- | head -n 10 > $temp + +if [ -z $temp ]; then + echo "$0: error running feat-to-len" 1>&2 + exit 1 +fi + +head -n 10 $dir/utt2dur | paste - $temp | \ + awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }' || exit 1; + +rm $temp + +exit 0 diff --git a/egs/wsj/s5/utils/data/get_num_frames.sh b/egs/wsj/s5/utils/data/get_num_frames.sh new file mode 100755 index 00000000000..9c4aae5e693 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_num_frames.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script works out the approximate number of frames in a training directory. +# This is sometimes needed by higher-level scripts + + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + ( + echo "Usage: $0 " + echo "Prints the number of frames of data in the data-dir" + ) 1>&2 +fi + +data=$1 + +if [ ! 
-f $data/utt2dur ]; then + utils/data/get_utt2dur.sh $data 1>&2 || exit 1 +fi + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 + +awk -v s=$frame_shift '{n += $2} END{print int(n / s)}' <$data/utt2dur diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh new file mode 100755 index 00000000000..344eb773581 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and adds the +# utt2dur file if it does not already exist. The file 'utt2dur' maps from +# utterance to the duration of the utterance in seconds. This script works it +# out from the 'segments' file, or, if not present, from the wav.scp file (it +# first tries interrogating the headers, and if this fails, it reads the wave +# files in entirely.) + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + exit 1 +fi + +export LC_ALL=C + +data=$1 + + +if [ -f $data/segments ]; then + echo "$0: working out $data/utt2dur from $data/segments" + cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur +else + echo "$0: segments file does not exist so getting durations from wave files" + if [ ! -f $data/wav.scp ]; then + echo "$0: Expected $data/wav.scp or $data/segments to exist" + exit 1 + fi + + # if the wav.scp contains only lines of the form + # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph | + if cat $data/wav.scp | perl -e ' + while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space. + @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ && + $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); } + $utt = $A[0]; $sphere_file = $A[4]; + if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; } + $sample_rate = -1; $sample_count = -1; + for ($n = 0; $n <= 30; $n++) { + $line = ; + if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; } + if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; } + if ($line =~ m/end_head/) { break; } + } + close(F); + if ($sample_rate == -1 || $sample_count == -1) { + die "could not parse sphere header from $sphere_file"; + } + $duration = $sample_count * 1.0 / $sample_rate; + print "$utt $duration\n"; + } ' > $data/utt2dur; then + echo "$0: successfully obtained utterance lengths from sphere-file headers" + else + echo "$0: could not get utterance lengths from sphere-file headers, using wav-to-duration" + if ! command -v wav-to-duration >/dev/null; then + echo "$0: wav-to-duration is not on your path" + exit 1; + fi + if ! 
wav-to-duration scp:$data/wav.scp ark,t:$data/utt2dur 2>&1 | grep -v 'nonzero return status'; then + echo "$0: there was a problem getting the durations; moving $data/utt2dur to $data/.backup/" + mkdir -p $data/.backup/ + mv $data/utt2dur $data/.backup/ + fi + fi +fi + +len1=$(cat $data/utt2spk | wc -l) +len2=$(cat $data/utt2dur | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1" +fi + +echo "$0: computed $data/utt2dur" + +exit 0 diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh new file mode 120000 index 00000000000..1cd5db30d92 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh @@ -0,0 +1 @@ +../perturb_data_dir_speed.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh new file mode 100755 index 00000000000..a5a030ffdd8 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +# Apache 2.0 + +# This script does the standard 3-way speed perturbing of +# a data directory (it operates on the wav.scp). + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: perturb_data_dir_speed_3way.sh " + echo "Applies standard 3-way speed perturbation using factors of 0.9, 1.0 and 1.1." + echo "e.g.:" + echo " $0 data/train data/train_sp" + echo "Note: if /feats.scp already exists, this will refuse to run." + exit 1 +fi + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/wav.scp ]; then + echo "$0: expected $srcdir/wav.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 +utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 +utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 + +rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 + +echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir + diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh new file mode 100755 index 00000000000..b7fb0cfce26 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and modifies +# the wav.scp to perturb the volume (typically useful for training data when +# using systems that don't have cepstral mean normalization). + +. utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + exit 1 +fi + +export LC_ALL=C + +data=$1 + +if [ ! -f $data/wav.scp ]; then + echo "$0: Expected $data/wav.scp to exist" + exit 1 +fi + +if [ grep "sox --vol" $data/wav.scp ]; then + echo "$0: It looks like the data was already volume perturbed. Not doing anything." 
+ exit 0 +fi + +cat $data/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +" > $data/wav.scp_scaled || exit 1; + +len1=$(cat $data/wav.scp | wc -l) +len2=$(cat $data/wav.scp_scaled | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: error detected: number of lines changed $len1 vs $len2"; + exit 1 +fi + +mv $data/wav.scp_scaled $data/wav.scp + +if [ -f $data/feats.scp ]; then + echo "$0: $data/feats.scp exists; moving it to $data/.backup/ as it wouldn't be valid any more." + mkdir -p $data/.backup/ + mv $data/feats.scp $data/.backup/ +fi + +echo "$0: added volume perturbation to the data in $data" +exit 0 + diff --git a/egs/wsj/s5/utils/data/validate_data_dir.sh b/egs/wsj/s5/utils/data/validate_data_dir.sh new file mode 120000 index 00000000000..1e19b4d921f --- /dev/null +++ b/egs/wsj/s5/utils/data/validate_data_dir.sh @@ -0,0 +1 @@ +../validate_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 4716925df7d..b6ce1511814 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -1,9 +1,9 @@ #!/bin/bash -# This script makes sure that only the segments present in +# This script makes sure that only the segments present in # all of "feats.scp", "wav.scp" [if present], segments [if present] # text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into +# It puts the original contents of data-dir into # data-dir/.backup if [ $# != 1 ]; then @@ -35,7 +35,8 @@ function check_sorted { fi } -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp reco2file_and_channel spk2gender utt2lang utt2uniq; do +for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x @@ -61,7 +62,7 @@ function filter_file { function filter_recordings { # We call this once before the stage when we filter on utterance-id, and once # after. - + if [ -f $data/segments ]; then # We have a segments file -> we need to filter this and the file wav.scp, and # reco2file_and_utt, if it exists, to make sure they have the same list of @@ -78,7 +79,7 @@ function filter_recordings { utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp mv $tmpdir/recordings.tmp $tmpdir/recordings - + cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments filter_file $tmpdir/recordings $data/segments cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments @@ -86,7 +87,7 @@ function filter_recordings { filter_file $tmpdir/recordings $data/wav.scp [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - + fi } @@ -116,8 +117,6 @@ function filter_speakers { function filter_utts { cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts -# Do a check. - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; @@ -128,7 +127,7 @@ function filter_utts { ! 
cat $data/spk2utt | sort | cmp - $data/spk2utt && \ echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - if [ -f $data/utt2uniq ]; then + if [ -f $data/utt2uniq ]; then ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; fi @@ -155,7 +154,7 @@ function filter_utts { fi fi - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang $maybe_wav; do + for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur $maybe_wav; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then diff --git a/egs/wsj/s5/utils/format_lm_sri.sh b/egs/wsj/s5/utils/format_lm_sri.sh index 7753c186045..7b5477e958a 100755 --- a/egs/wsj/s5/utils/format_lm_sri.sh +++ b/egs/wsj/s5/utils/format_lm_sri.sh @@ -71,8 +71,8 @@ if [ -z $loc ]; then export PATH=$PATH:$sdir:$sdir/.. else echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. + echo or installed in $sdir. cd to ../../../tools and run + echo extras/install_srilm.sh. exit 1 fi fi @@ -88,8 +88,8 @@ lm_base=$(basename $lm '.gz') gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ > $out_dir/oovs_${lm_base}.txt || exit 1; -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures +# Removing all "illegal" combinations of and , which are supposed to +# occur only at being/end of utt. These can cause determinization failures # of CLG [ends up being epsilon cycles]. gunzip -c $lm \ | egrep -v ' | | ' \ @@ -98,8 +98,8 @@ gunzip -c $lm \ awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; # Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose +# and the set of words in the pronunciation lexicon. This also renormalizes the +# LM by recomputing the backoff weights, and remove those ngrams whose # probabilities are lower than the backed-off estimates. change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ $srilm_opts || exit 1; diff --git a/egs/wsj/s5/utils/lang/add_lex_disambig.pl b/egs/wsj/s5/utils/lang/add_lex_disambig.pl new file mode 120000 index 00000000000..2d1d4425b49 --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_lex_disambig.pl @@ -0,0 +1 @@ +../add_lex_disambig.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/check_g_properties.pl b/egs/wsj/s5/utils/lang/check_g_properties.pl new file mode 100755 index 00000000000..aa0e6eb1c78 --- /dev/null +++ b/egs/wsj/s5/utils/lang/check_g_properties.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +if (@ARGV != 1) { + print "Usage: $0 [options] \n"; + print "e.g.: $0 data/lang\n"; + exit(1); +} + +$lang = shift @ARGV; + +# This script checks that G.fst in the lang.fst directory is OK with respect +# to certain expected properties, and returns nonzero exit status if a problem was +# detected. It is called from validate_lang.pl. +# This only checks the properties of G that relate to disambiguation symbols, +# epsilons and forbidden symbols and . + +if (! 
-e "$lang/G.fst") { + print "$0: error: $lang/G.fst does not exist\n"; + exit(1); +} + +open(W, "<$lang/words.txt") || die "opening $lang/words.txt"; +$hash_zero = -1; +while () { + @A = split(" ", $_); + ($sym, $int) = @A; + if ($sym eq "" || $sym eq "") { $is_forbidden{$int} = 1; } + if ($sym eq "#0") { $hash_zero = $int; } +} + +if (-e "$lang/phones/wdisambig_words.int") { + open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int"; + while () { + chop; + $is_disambig{$_} = 1; + } +} else { + $is_disambig{$hash_zero} = 1; +} + +$input_cmd = ". ./path.sh; fstprint $lang/G.fst|"; +open(G, $input_cmd) || die "running command $input_cmd"; + +$info_cmd = ". ./path.sh; fstcompile | fstinfo "; +open2(O, I, "$info_cmd") || die "running command $info_cmd"; + +$has_epsilons = 0; + +while () { + @A = split(" ", $_); + if (@A >= 4) { + if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) { + chop; + print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol or \n"; + exit(1); + } elsif ($is_disambig{$A[2]}) { + print O $_; + if ($A[3] != 0) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n"; + exit(1); + } + } elsif ($A[2] == 0) { + print O $_; + $has_epsilons = 1; + } elsif ($A[2] != $A[3]) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol.\n"; + exit(1); + } + } +} + +close(O); # tell 'fstcompile | fstinfo' pipeline that its input is done. +while () { + if (m/cyclic\s+/) { + print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n"; + exit(1); + } +} + +if ($has_epsilons) { + print "$0: warning: validating $lang: G.fst has epsilon-input arcs. We don't expect these in most setups.\n"; +} + +print "--> $0 successfully validated $lang/G.fst\n"; +exit(0); diff --git a/egs/wsj/s5/utils/lang/prepare_lang.sh b/egs/wsj/s5/utils/lang/prepare_lang.sh new file mode 120000 index 00000000000..96b9f592e82 --- /dev/null +++ b/egs/wsj/s5/utils/lang/prepare_lang.sh @@ -0,0 +1 @@ +../prepare_lang.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/validate_lang.pl b/egs/wsj/s5/utils/lang/validate_lang.pl new file mode 120000 index 00000000000..edb66bf3149 --- /dev/null +++ b/egs/wsj/s5/utils/lang/validate_lang.pl @@ -0,0 +1 @@ +../validate_lang.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/make_phone_bigram_lang.sh index 87d1db8f3e8..a8a67870fb3 100755 --- a/egs/wsj/s5/utils/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/make_phone_bigram_lang.sh @@ -11,7 +11,7 @@ # language-id. -# We might later have options here; if not, I'llr emove this. +# We might later have options here; if not, I'll emove this. echo "$0 $@" # Print the command line for logging @@ -42,6 +42,8 @@ rm -r $lang_out/phones 2>/dev/null cp -r $lang/phones/ $lang_out/ rm $lang_out/phones/word_boundary.* 2>/dev/null # these would # no longer be valid. +rm $lang_out/phones/wdisambig* 2>/dev/null # ditto this. + # List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst # are determinizable without any. 
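# A quick spot-check of that claim (illustrative only; $lang_out is the script's own output
# directory, and fsttablecompose/fstdeterminizestar are standard Kaldi fstbin tools assumed
# to be on the path):
#   fsttablecompose $lang_out/L.fst $lang_out/G.fst | fstdeterminizestar >/dev/null \
#     && echo "L o G determinized OK without any disambiguation symbols"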
echo -n > $lang_out/phones/disambig.txt @@ -81,7 +83,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ foreach $p (@phones) { $src = $phn2state{$p}; $hist = $histcount{$p}; - $hist > 0 || die; + $hist > 0 || die; foreach $q (@phones) { $c = $count{$p,$q}; if (defined $c) { @@ -92,7 +94,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ } $c = $count{$p,""}; if (defined $c) { - $cost = -log($c / $hist); # cost on FST arc. + $cost = -log($c / $hist); # cost on FST arc. print "$src $cost\n"; # final-prob. } } ' | fstcompile --acceptor=true | \ @@ -101,7 +103,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ # symbols for phones and words are the same. # Neither has disambig symbols. cp $lang_out/phones.txt $lang_out/words.txt - + grep -v '' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \ fstcompile > $lang_out/L.fst diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index 61c0962cf15..091ea0c069e 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2013 Johns Hopkins University (author: Daniel Povey) # 2014 Tom Ko @@ -36,7 +36,7 @@ which sox &>/dev/null ! [ $? -eq 0 ] && echo "sox: command not found" && exit 1; if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" + echo "$0: no such file $srcdir/utt2spk" exit 1; fi @@ -65,18 +65,18 @@ if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp if [ -f $srcdir/reco2file_and_channel ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel fi - + rm $destdir/reco_map 2>/dev/null else # no segments->wav indexed by utterance. 
if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp fi fi @@ -88,6 +88,10 @@ if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ -f $srcdir/utt2dur ]; then + cat $srcdir/utt2dur | utils/apply_map.pl -f 1 $destdir/utt_map | \ + awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur +fi rm $destdir/spk_map $destdir/utt_map 2>/dev/null echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/perturb_data_signal.sh b/egs/wsj/s5/utils/perturb_data_signal.sh new file mode 100755 index 00000000000..7034dd22d5b --- /dev/null +++ b/egs/wsj/s5/utils/perturb_data_signal.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# 2014 Tom Ko +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# wav.scp +# spk2utt +# utt2spk +# text +# spk_filter.scp +# It generates the files which are used for perturbing the data at signal-level. + +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: perturb_data_signal.sh " + echo "e.g.:" + echo " $0 'fp01' data/train_si284 data/train_si284p" + exit 1 +fi + +export LC_ALL=C + +prefix=$1 +srcdir=$2 +destdir=$3 +spk_prefix=$prefix"-" +utt_prefix=$prefix"-" + +for f in spk2utt text utt2spk wav.scp spk_filter.scp; do + [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1; +done + +set -e; +set -o pipefail + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + + +# The following perl script is the core part. 
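# What it produces, sketched with hypothetical recording ids and spk_filter.scp entries:
# each recording's wav.scp pipe gets two extra stages, a forward pass (--inverse=false)
# with the recording's own filter and an inverse pass (--inverse=true) with a different,
# randomly chosen filter, e.g.
#   fp01-rec1 sph2pipe -f wav rec1.sph | apply-filter --inverse=false "scp:echo rec1 filt_rec1.mat |" - - | apply-filter --inverse=true "scp:echo spk07 filt_spk07.mat |" - - |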
+
+echo $spk_prefix | perl -e '
+  $prefix = <STDIN>;
+  chomp($prefix);
+  ($u2s_in, $seg_in, $wav_in, $filt_in, $wav_out) = @ARGV;
+  if (open(SEG, "<$seg_in")) {
+    $have_segments="true";
+  } else {
+    $have_segments="false";
+  }
+  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
+  open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n";
+  open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n";
+  open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n";
+  while (<UI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    ($utt_id, $spk) = @col;
+    $utt2spk{$utt_id} = $spk;
+  }
+  if ($have_segments eq "true") {
+    while (<SEG>) {
+      chomp;
+      @col = split;
+      $reco2utt{$col[1]} = $col[0];
+    }
+  }
+  while (<WI>) {
+    chomp;
+    @col = split;
+    $pipe = join(" ", @col[1..@col-1]);
+    $reco2pipe{$col[0]} = $pipe;
+    $recolist{$col[0]} = $col[0];
+    if ($have_segments eq "false") {
+      $reco2utt{$col[0]} = $col[0];
+    }
+  }
+  while (<FI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    $spk2filt{$col[0]} = $col[1];
+  }
+
+  foreach $reco (sort keys %recolist) {
+    #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}};
+    #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}};
+    $reco2spk{$reco} = $reco;
+    $reco2filt{$reco} = $spk2filt{$reco};
+    if ($reco2filt{$reco} eq "") {
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2spk{$reco} = $spk;
+      $reco2filt{$reco} = $spk2filt{$spk};
+    }
+    while (1) {
+      # randomly pick a filter from another speaker
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2perturbspk{$reco} = $spk;
+      $reco2perturbfilt{$reco} = $spk2filt{$spk};
+      if ($reco2perturbfilt{$reco} ne $reco2filt{$reco}) {
+        last;
+      }
+    }
+  }
+
+  foreach $reco (sort keys %recolist) {
+    print WO "$prefix$reco $reco2pipe{$reco} apply-filter --inverse=false \"scp:echo $reco2spk{$reco} $reco2filt{$reco} |\" - - | apply-filter --inverse=true \"scp:echo $reco2perturbspk{$reco} $reco2perturbfilt{$reco} |\" - - |\n";
+  }
+
+' $srcdir/utt2spk $srcdir/segments $srcdir/wav.scp \
+$srcdir/spk_filter.scp $destdir/wav.scp
+
+if [ -f $srcdir/segments ]; then
+  # also apply the spk_prefix to the recording-ids.
+ cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map + + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments + + if [ -f $srcdir/reco2file_and_channel ]; then + utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel + fi + + rm $destdir/reco_map 2>/dev/null +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/spk2gender ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender +fi + + +rm $destdir/spk_map $destdir/utt_map 2>/dev/null +echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir diff --git a/egs/wsj/s5/utils/perturb_data_signal_v2.sh b/egs/wsj/s5/utils/perturb_data_signal_v2.sh new file mode 100755 index 00000000000..c205b67e5e0 --- /dev/null +++ b/egs/wsj/s5/utils/perturb_data_signal_v2.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# 2014 Tom Ko +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# wav.scp +# spk2utt +# utt2spk +# text +# spk_filter.scp +# It generates the files which are used for perturbing the data at signal-level. + +. utils/parse_options.sh + +if [ $# != 4 ]; then + echo "Usage: perturb_data_signal.sh " + echo "e.g.:" + echo " $0 3 'fp01' data/train_si284 data/train_si284p" + exit 1 +fi + +export LC_ALL=C + +num_parts=$1 +prefix=$2 +srcdir=$3 +destdir=$4 +spk_prefix=$prefix"-" +utt_prefix=$prefix"-" + +for f in spk2utt text utt2spk wav.scp spk_filter.scp; do + [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1; +done + +set -e; +set -o pipefail + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + + +# The following perl script is the core part. 
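# Sketch of the intended effect (all ids hypothetical): with <num-parts>=2, each speaker's
# utterances are split into parts 00 and 01, and each part becomes a new recording with its
# own randomly chosen filter; the new segments file then contains lines like
#   fp01-utt3 fp01-rec1-00 1.20 3.45
#   fp01-utt9 fp01-rec1-01 7.80 9.10
# and wav.scp gets one filtered pipe per part-recording (fp01-rec1-00, fp01-rec1-01, ...).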
+ +echo $spk_prefix | perl -e ' + $prefix = ; + chomp($prefix); + ($num_parts, $u2s_in, $s2u_in, $seg_in, $wav_in, $filt_in, $wav_out, $seg_out) = @ARGV; + if (open(SEG, "<$seg_in")) { + $have_segments="true"; + } else { + $have_segments="false"; + } + open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n"; + open(SI, "<$s2u_in") || die "Error: fail to open $s2u_in\n"; + open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n"; + open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n"; + open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n"; + open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n"; + while () { + chomp; + @col = split; + @col == 2 || die "Error: bad line $_\n"; + ($utt_id, $spk) = @col; + $utt2spk{$utt_id} = $spk; + } + while () { + chomp; + @col = split; + $spks = join(" ", @col[1..@col-1]); + $spk2utt{$col[0]} = $spks; + } + if ($have_segments eq "true") { + while () { + chomp; + @col = split; + $seg = join(" ", @col[2..@col-1]); + $reco2utt{$col[1]} = $col[0]; + $utt2reco{$col[0]} = $col[1]; + $utt2seg{$col[0]} = $seg; + } + } + while () { + chomp; + @col = split; + $pipe = join(" ", @col[1..@col-1]); + $reco2pipe{$col[0]} = $pipe; + $recolist{$col[0]} = $col[0]; + if ($have_segments eq "false") { + $reco2utt{$col[0]} = $col[0]; + } + } + while () { + chomp; + @col = split; + @col == 2 || die "Error: bad line $_\n"; + $spk2filt{$col[0]} = $col[1]; + } + + foreach $reco (sort keys %recolist) { + #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}}; + #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}}; + $reco2spk{$reco} = $reco; + $reco2filt{$reco} = $spk2filt{$reco}; + for (my $i=0; $i < $num_parts; $i++) { + $newreco2spk{$reco.$i} = $reco; + } + @spk2filt_rand{keys %spk2filt} = @spk2filt{keys %spk2filt}; + delete $spk2filt_rand{$reco}; + if ($reco2filt{$reco} eq "") { + $spk = (keys %spk2filt)[rand keys %spk2filt]; + $reco2spk{$reco} = $spk; + $reco2filt{$reco} = $spk2filt{$spk}; + delete $spk2filt_rand{$spk}; + } + for (my $i=0; $i < $num_parts; $i++) { + # randomly pick a filter from another speaker + $spk = (keys %spk2filt_rand)[rand keys %spk2filt_rand]; + $newreco2perturbspk{$reco.$i} = $spk; + $newreco2perturbfilt{$reco.$i} = $spk2filt{$spk}; + delete $spk2filt_rand{$spk}; + } + } + + foreach $spk (sort keys %spk2utt) { + @utts = split(" ", $spk2utt{$spk}); + $numutts = @utts; + if ($numutts < $num_parts) { + $partsize = $numutts; + } else { + $partsize = $numutts / $num_parts; + } + for (my $i=0; $i < $numutts; $i++) { + $partid = int($i / $partsize); + $utt = $utts[$i]; + $filled = sprintf "%02d", $partid; + print SO "$prefix$utt $prefix$utt2reco{$utt}-$filled $utt2seg{$utt}\n"; + $newrecolist{"$prefix$utt2reco{$utt}-$filled"} = "$prefix$utt2reco{$utt}-$filled"; + } + } + + foreach $reco (sort keys %recolist) { + for (my $i=0; $i < $num_parts; $i++) { + $filled = sprintf "%02d", $i; + if ($newrecolist{"$prefix$reco-$filled"} ne "") { + print WO "$prefix$reco-$filled $reco2pipe{$reco} apply-filter \"scp:echo $reco2spk{$reco} $reco2filt{$reco} |\" - - | apply-filter --inverse=true \"scp:echo $newreco2perturbspk{$reco.$i} $newreco2perturbfilt{$reco.$i} |\" - - |\n"; + } + } + } + +' $num_parts $srcdir/utt2spk $srcdir/spk2utt $srcdir/segments $srcdir/wav.scp \ +$srcdir/spk_filter.scp $destdir/wav.scp $destdir/segments + +if [ -f $srcdir/segments ]; then + # also apply the spk_prefix to the recording-ids. 
+ cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map + +# utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments + +# if [ -f $srcdir/reco2file_and_channel ]; then +# utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel +# fi + + rm $destdir/reco_map 2>/dev/null +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/spk2gender ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender +fi + + +rm $destdir/spk_map $destdir/utt_map 2>/dev/null +echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 43b8bce1f4c..0014f22a04e 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -28,20 +28,21 @@ # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt # and extra_questions.txt # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the # "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be # different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) +# The file "optional_silence.txt" contains just a single phone (typically SIL) # which is used for optional silence in the lexicon. # extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automtically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about # stress or tone). +# # This script adds word-position-dependent phones and constructs a host of other # derived files, that go in data/lang/. @@ -49,19 +50,20 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 +num_word_disambig_syms=1 position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source reverse=false -share_silence_phones=false # if true, then share pdfs of different silence +share_silence_phones=false # if true, then share pdfs of different silence # phones together. 
sil_prob=0.5 phone_symbol_table= # if set, use a specified phones.txt file. # end configuration sections -. utils/parse_options.sh +. utils/parse_options.sh -if [ $# -ne 4 ]; then +if [ $# -ne 4 ]; then echo "usage: utils/prepare_lang.sh " echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" echo " should contain the following files:" @@ -133,10 +135,10 @@ if $position_dependent_phones; then # adding the markers _B, _E, _S, _I depending on word position. # In this recipe, these markers apply to silence also. # Do this starting from lexiconp.txt only. - if "$silprob"; then + if "$silprob"; then perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt @@ -158,11 +160,11 @@ if $position_dependent_phones; then mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt fi fi - + # create $tmpdir/phone_map.txt # this has the format (on each line) # ... - # where the versions depend on the position of the phone within a word. + # where the versions depend on the position of the phone within a word. # For instance, we'd have: # AA AA_B AA_E AA_I AA_S # for (B)egin, (E)nd, (I)nternal and (S)ingleton @@ -174,11 +176,11 @@ if $position_dependent_phones; then # This phone map expands the phone lists into all the word-position-dependent # versions of the phone lists. - cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else - if "$silprob"; then + if "$silprob"; then cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt if $reverse; then echo "We do not support reverse option and silprob at the same time" @@ -245,10 +247,10 @@ cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_m # be inside a word. if $position_dependent_phones; then for suffix in _B _E _I _S; do - (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done for suffix in "" _B _E _I _S; do - (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done fi @@ -277,7 +279,7 @@ if [[ ! 
-z $phone_symbol_table ]]; then start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt + cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt else echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt @@ -313,7 +315,7 @@ fi cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' BEGIN { print " 0"; - } + } { if ($1 == "") { print " is in the vocabulary!" | "cat 1>&2" @@ -362,7 +364,7 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int # Create the basic L.fst without disambiguation symbols, for use -# in training. +# in training. if $silprob; then # Usually it's the same as having a fixed-prob L.fst @@ -386,7 +388,18 @@ cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; # integer version of oov symbol, used in some scripts. -# Create these lists of phones in colon-separated integer list form too, +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). +# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, # for purposes of being given to programs as command-line options. for f in silence nonsilence optional_silence disambig context_indep; do utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int @@ -415,20 +428,18 @@ utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonel # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra step where we create a loop to "pass through" the # disambiguation symbols from G.fst. 
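# Illustrative contents of the new word-disambiguation files (the integer ids below are
# hypothetical; they are whatever ids #0 has in phones.txt and words.txt):
#   phones/wdisambig.txt:         #0
#   phones/wdisambig_phones.int:  347
#   phones/wdisambig_words.int:   200001
# These two .int files are what fstaddselfloops now reads directly, replacing the old
# "echo $phone_disambig_symbol |" / "echo $word_disambig_symbol |" pipes removed below.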
-phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` if $silprob; then utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; else utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; fi diff --git a/egs/wsj/s5/utils/slurm.pl b/egs/wsj/s5/utils/slurm.pl index 68c269080ac..8095272732e 100755 --- a/egs/wsj/s5/utils/slurm.pl +++ b/egs/wsj/s5/utils/slurm.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# slurm.pl was created from the queue.pl +# slurm.pl was created from the queue.pl # queue.pl has the same functionality as run.pl, except that # it runs the job in question on the queue (Sun GridEngine). # This version of queue.pl uses the task array functionality @@ -20,7 +20,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -30,7 +30,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -60,7 +60,7 @@ my $qsub_opts = ""; my $sync = 0; my $num_threads = 1; -my $max_jobs_run; +my $max_jobs_run; my $gpu = 0; my $config = "conf/slurm.conf"; @@ -99,12 +99,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 3; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. 
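# Usage sketch (paths and option values are hypothetical); because of the interleaving,
# both of these forms are accepted:
#   utils/slurm.pl --mem 2G JOB=1:10 exp/foo/log/run.JOB.log echo "this is job JOB"
#   utils/slurm.pl JOB=1:10 --mem 2G exp/foo/log/run.JOB.log echo "this is job JOB"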
while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -121,10 +121,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -160,7 +160,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -168,17 +168,18 @@ () option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 +option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -option max_jobs_run=* # Do nothing default gpu=0 option gpu=0 -p shared option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. EOF # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -192,7 +193,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -212,12 +213,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; } if (exists $cli_options{$option}) { @@ -237,7 +238,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. 
default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -261,19 +262,25 @@ () for my $option (keys %cli_options) { if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { print STDERR "Ignoring $option\n"; next; } + my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { $max_jobs_run = $value; } - if (exists $cli_default_options{($option,$value)}) { + if ($option eq "max_jobs_run") { + if ($array_job != 1) { + print STDERR "Ignoring $option since this is not an array task."; + } else { + $max_jobs_run = $value; + } + } elsif (exists $cli_default_options{($option,$value)}) { $qsub_opts .= "$cli_default_options{($option,$value)} "; } elsif (exists $cli_config_options{$option}) { $qsub_opts .= "$cli_config_options{$option} "; } elsif (exists $cli_default_options{($option,"*")}) { $qsub_opts .= $cli_default_options{($option,"*")} . " "; } else { - if ($opened_config_file == 0) { $config = "default config file"; } + if ($opened_config_file == 0) { + $config = "default config file"; + } die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; } } @@ -301,7 +308,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -322,23 +329,23 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; + $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; + $queue_array_opt = "--array ${jobstart}-${jobend}"; } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get + $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -475,14 +482,14 @@ () } } - # Check that the job exists in SLURM. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SLURM. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("squeue -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. 
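# For example (sketch): if squeue exits with status 1 because the job id is unknown,
# system() returns 256, and $ret >> 8 recovers the exit status 1 that is tested just below.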
if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -546,7 +553,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; push @logfiles, $l; } diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 63f8bdbf3b9..19452c3c235 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -133,7 +133,7 @@ if [ -f $data/wav.scp ]; then ! cat $data/segments | \ awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; - + segments_len=`cat $data/segments | wc -l` if [ -f $data/text ]; then ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/text) && \ @@ -153,14 +153,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -188,14 +188,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -228,6 +228,7 @@ if [ -f $data/feats.scp ]; then fi fi + if [ -f $data/cmvn.scp ]; then check_sorted_and_uniq $data/cmvn.scp cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn @@ -294,4 +295,19 @@ for f in vad.scp utt2lang utt2uniq; do fi done + +if [ -f $data/utt2dur ]; then + check_sorted_and_uniq $data/utt2dur + cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur + if ! cmp -s $tmpdir/utts{,.utt2dur}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.feats} + exit 1; + fi + cat $data/utt2dur | \ + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 +fi + + echo "$0: Successfully validated data-directory $data" diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index ae087bd9578..f9a27584b07 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -22,6 +22,7 @@ exit(1); } +print "$0 " . join(" ", @ARGV) . 
"\n"; $lang = shift @ARGV; $exit = 0; @@ -89,15 +90,7 @@ $wint2sym{$wsymtab{$_}} = $_; } } -if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - print "--> $lang/words.txt is OK\n"; -} else { - $warning = 1; - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; -} +print "--> $lang/words.txt is OK\n"; print "\n"; # Checking phones/* ------------------------------- @@ -113,7 +106,6 @@ sub check_txt_int_csl { if (!open(CSL, "<$cat.csl")) { $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; } - if (-z "$cat.txt") { $warning = 1; print "--> WARNING: $cat.txt is empty\n"; } @@ -743,6 +735,76 @@ sub check_summation { } } +sub check_wdisambig { + print "Checking word-level disambiguation symbols...\n"; + # This block checks that one of the two following conditions hold: + # (1) for lang diretories prepared by older versions of prepare_lang.sh: + # The symbol '#0' should appear in words.txt and phones.txt, and should + # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int + # exist, and have the expected properties (see below for details). + my %wdisambig_words_hash; + my %wdisambig_words_string = ""; + + if (! -e "$lang/phones/wdisambig.txt") { + print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; + if (exists $wsymtab{"#0"}) { + print "--> $lang/words.txt has \"#0\"\n"; + $wdisambig_words_hash{$wsymtab{"#0"}} = 1; + $wdisambig_words_string = $wsymtab{"#0"}; + } else { + print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; + } + } else { + print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; + if (!open(T, "<$lang/phones/wdisambig.txt")) { + print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; + } + chomp(my @wdisambig = ); + close(T); + if (!open(W, "<$lang/phones/wdisambig_words.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_words = ); + close(W); + if (!open(P, "<$lang/phones/wdisambig_phones.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_phones =
<P>
); + close(P); + my $len = @wdisambig, $len2; + if (($len2 = @wdisambig_words) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + $exit = 1; return; + } + if (($len2 = @wdisambig_phones) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + $exit = 1; return; + } + for (my $i = 0; $i < $len; $i++) { + if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; + $exit = 1; return; + } + } + for (my $i = 0; $i < $len; $i++) { + if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; + $exit = 1; return; + } + } + foreach my $i ( @wdisambig_words ) { + $wdisambig_words_hash{$i} = 1; + $wdisambig_words_string .= " " . $i; + } + } +} + +check_wdisambig(); + if (-e "$lang/G.fst") { # Check that G.fst is ilabel sorted and nonempty. $text = `. ./path.sh; fstinfo $lang/G.fst`; @@ -781,21 +843,17 @@ sub check_summation { } # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and . - $cmd = ". ./path.sh; fstprint $lang/G.fst | awk -v disambig=$lang/phones/disambig.int -v words=$lang/words.txt 'BEGIN{while((getline0) is_disambig[\$1]=1; is_disambig[0] = 1; while((getline0){ if(\$1==\"\"||\$1==\"\") is_forbidden[\$2]=1;}} {if(NF<3 || is_disambig[\$3]) print; else if(is_forbidden[\$3] || is_forbidden[\$4]) { print \"Error: line \" \$0 \" in G.fst contains forbidden symbol or \" | \"cat 1>&2\"; exit(1); }}' | fstcompile | fstinfo "; - $output = `$cmd`; - if ($output !~ m/# of states\s+[1-9]/) { # fstinfo did not read a nonempty FST (there should be final probs at least)... - print "--> ERROR: failure running command to check for disambig-sym loops [possibly G.fst " . - "contained the forbidden symbols or , or possibly some other error.. Output was: \n"; - print $output; - $exit = 1; - } - if ($output !~ m/cyclic\s+n/) { # FST was cyclic after selecting only for disambig symbols. This is now allowed. - print "--> ERROR: G.fst contained cycles with only disambiguation symbols or epsilons on the input. Would cause determinization failure in graph creation.\n"; - $exit = 1; - } else { - print "--> G.fst did not contain cycles with only disambig symbols or epsilon on the input, and did not contain\n" . - "the forbidden symbols or (if present in vocab) on the input or output.\n"; + # epsilons on the input, or the forbidden symbols and (and a few + # related checks + + if (-e "$lang/G.fst") { + system("utils/lang/check_g_properties.pl $lang"); + if ($? != 0) { + print "--> ERROR: failure running check_g_properties.pl\n"; + $exit = 1; + } else { + print("--> utils/lang/check_g_properties.pl succeeded.\n"); + } } } diff --git a/notes b/notes deleted file mode 100644 index b0777bd71f4..00000000000 --- a/notes +++ /dev/null @@ -1,109 +0,0 @@ --- -TODO: --- - - Transition-model equivalent. - - chain::Topology - - This stores the topology for each phone in the 'chain-model' modeling code. - It has a list of phones and allows you to get the topology FST for each phone. - - A topology is an unweighted, epsilon-free acceptor FST [acceptor means the - ilabels and olabels are the same]. Its initial state must not be final. 
- The labels on the arcs must start from 1 and have no gaps-- i.e. they must - form a set like (1, 2) or (1, 2, 3). - - An example FST would be (in OpenFst acceptor format), - -0 1 1 # transition from state 0 to state 1 with label 1. -1 1 2 # transition from state 1 to state 1 (self-loop) with label 2. -1 0 # this says that state 1 is final. - - - A Topology object is 'alignable' if all of the phones' topology FSTs - have the property that the set of labels on the arcs from the start state are - disjoint from the set of labels on other arcs, and there are no transitions to - the start state. This means that we can identify the beginning of the phone. - - - chain::PhoneContextModel - - list of phones - - LeftContext() ... the number of phones of left context (there is no right context). - - A mechanism to find a particular context-depenendent phone: you have to call the following - LeftContext() + 1 times. - - // returns new state. if phone_in_context != NULL, outputs - // the cd_phone_index to there (FST-wise, view this as the input symbol on the - // transition, where 'phone' is the output symbool. - int32 AdvanceState(int32 cur_state, int32 phone, int32 *cd_phone); - - Fst *GetFst(); - - - Phone indexes may not include zero (and may have gaps) - - - cd_phoness are 1-based, without gaps. - - - We will initialize the PhoneContextModel using a tree for now, but in - future we may enable different ways of doing this. We'll require that - the tree be trained using only one pdf-class. - - TODO: enable tree-building with separate stats per state, but to give a single - index per phone. [so store array of Gaussian stats]. ----- - - To get labels for the individual transitions on the context-dependent phones, - we need to store an offset for each - - chain::ContextDependentTopology [note, you can use this even if you don't have - context]. - stores Topology and PhoneContextModel, and also stores offsets for each context-dependent - phone that allow us to assign a unique context-dependent label for each label in the - cd-label - - cd-labels will be 1-based so they can appear on FSTs. We may subtract one so they - can appear at the output layer of a nnet. ---- - - FST-based representation of phone language model?? We can get it from class - LanguageModel as an FST, and then prune away disallowed phone sequences and - rebalance. The output can be in the standard FST representation. - - What do we do about initial alphas and final betas? We want to limit it - to the same states that are active in the phone lattice at that time, as a - better approximation of the end effects. - - For the betas, it's just a question of what [context-independent] phones are - active at the end-time. - - We can limit it with reasonable acuracy by just considering the set of - symbols that are active at times 0 and T, and then limiting the alphas and - betas to the states from which those symbols would be emitted. - By time T we mean one past the end of the file. We can store information - saying that either it's a final-prob, or just storing the active symbols - at that point. - - - - - ---- - - - phones. It will store the probs more compactly than OpenFst. - ---- - Suppose we have 200 phones, and 500 history-states. - If there are 5000 CD-phones, then each phone has on average 25 versions... - - Suppose for each history-state, the output-prob for a phone is just a - combination of some subspace of output-probs for that phone. - - num-params = 500 * 200 * 25 = 2.5 million. 
- --- - - - - diff --git a/src/Makefile b/src/Makefile index 57a4b98e0aa..4fe95251b1e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -147,7 +147,7 @@ $(EXT_SUBDIRS) : mklibdir bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain #2)The libraries have inter-dependencies base: diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index 9629c5466ad..9311645cc0c 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -3,6 +3,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian; // Johns Hopkins University (Author: Daniel Povey) +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -87,6 +88,112 @@ template inline void ReadBasicType(std::istream &is, } } +// Template that covers integers. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. + os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. + os << "[ "; + typename std::vector >::const_iterator iter = v.begin(), + end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(iter->first) << ',' + << static_cast(iter->second) << ' '; + else + os << iter->first << ',' + << iter->second << ' '; + } + os << "]\n"; + } + if (os.fail()) { + throw std::runtime_error("Write failure in WriteIntegerPairVector."); + } +} + +// Template that covers integers. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz*2); + } + } else { + std::vector > tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. + is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. 
+ int16 next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } else { + T next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. + } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerPairVector: read failure at file position " + << is.tellg(); +} template inline void WriteIntegerVector(std::ostream &os, bool binary, const std::vector &v) { diff --git a/src/base/io-funcs-test.cc b/src/base/io-funcs-test.cc index 63506073ff8..dd05326d5ed 100644 --- a/src/base/io-funcs-test.cc +++ b/src/base/io-funcs-test.cc @@ -43,8 +43,20 @@ void UnitTestIo(bool binary) { WriteIntegerVector(outfile, binary, vec2); if (!binary) outfile << " \n"; std::vector vec3; - for (size_t i = 0; i < 10; i++) vec3.push_back(Rand()%100); + + int32 size = RandInt(0, 10); + for (size_t i = 0; i < size; i++) vec3.push_back(Rand()%100); WriteIntegerVector(outfile, binary, vec3); + std::vector > vec4; + WriteIntegerPairVector(outfile, binary, vec4); + if (!binary && Rand()%2 == 0) outfile << " \n"; + std::vector > vec5; + for (size_t i = 0; i < size; i++) vec5.push_back(std::make_pair(Rand()%100 - 10, Rand()%100 - 10)); + WriteIntegerPairVector(outfile, binary, vec5); + if (!binary) outfile << " \n"; + std::vector > vec6; + for (size_t i = 0; i < size; i++) vec6.push_back(std::make_pair(Rand()%100, Rand()%100)); + WriteIntegerPairVector(outfile, binary, vec6); if (!binary && Rand()%2 == 0) outfile << " \n"; const char *token1 = "Hi"; WriteToken(outfile, binary, token1); @@ -90,6 +102,15 @@ void UnitTestIo(bool binary) { std::vector vec3_in; ReadIntegerVector(infile, binary_in, &vec3_in); KALDI_ASSERT(vec3_in == vec3); + std::vector > vec4_in; + ReadIntegerPairVector(infile, binary_in, &vec4_in); + KALDI_ASSERT(vec4_in == vec4); + std::vector > vec5_in; + ReadIntegerPairVector(infile, binary_in, &vec5_in); + KALDI_ASSERT(vec5_in == vec5); + std::vector > vec6_in; + ReadIntegerPairVector(infile, binary_in, &vec6_in); + KALDI_ASSERT(vec6_in == vec6); std::string token1_in, token2_in; KALDI_ASSERT(Peek(infile, binary_in) == static_cast(*token1)); KALDI_ASSERT(PeekToken(infile, binary_in) == static_cast(*token1)); diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index ba0cf1c1c7c..4caddc6b5b3 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -181,6 +182,16 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, template inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); +/// Function for writing STL vectors of pairs of integer types. 
+template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v); + +/// Function for reading STL vector of pairs of integer types. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v); + /// The WriteToken functions are for writing nonempty sequences of non-space /// characters. They are not for general strings. void WriteToken(std::ostream &os, bool binary, const char *token); diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index e28ddcc1a09..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index 13a3412a9bb..1ae1dc0b758 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -20,7 +20,9 @@ #include #elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW) #include +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf +#endif /* _MSC_VER < 1900 */ #else #include #endif diff --git a/src/bin/Makefile b/src/bin/Makefile index ac175e42e0e..74b1b5de62b 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -5,7 +5,8 @@ include ../kaldi.mk BINFILES = align-equal align-equal-compiled acc-tree-stats \ show-alignments compile-questions cluster-phones \ - compute-wer make-h-transducer add-self-loops convert-ali \ + compute-wer compute-wer-bootci make-h-transducer \ + add-self-loops convert-ali \ compile-train-graphs compile-train-graphs-fsts arpa2fst \ make-pdf-to-tid-transducer make-ilabel-transducer show-transitions \ ali-to-phones ali-to-post weight-silence-post acc-lda est-lda \ diff --git a/src/bin/analyze-counts.cc b/src/bin/analyze-counts.cc index 60be710c79d..6c5d0328936 100644 --- a/src/bin/analyze-counts.cc +++ b/src/bin/analyze-counts.cc @@ -1,6 +1,6 @@ // bin/analyze-counts.cc -// Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +// Copyright 2012-2016 Brno University of Technology (Author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -51,6 +51,15 @@ int main(int argc, char *argv[]) { po.Register("binary", &binary, "write in binary mode"); po.Register("symbol-table", &symbol_table_filename, "Read symbol table for display of counts"); + int32 counts_dim = 0; + po.Register("counts-dim", &counts_dim, + "Output dimension of the counts, a hint for dimension auto-detection."); + + std::string frame_weights; + po.Register("frame-weights", &frame_weights, "Per-frame weights (counting weighted frames)."); + std::string utt_weights; + po.Register("utt-weights", &utt_weights, "Per-utterance weights (counting weighted frames)."); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -61,39 +70,78 @@ int main(int argc, char *argv[]) { std::string 
alignments_rspecifier = po.GetArg(1), wxfilename = po.GetArg(2); - SequentialInt32VectorReader reader(alignments_rspecifier); + SequentialInt32VectorReader alignment_reader(alignments_rspecifier); - // Get the counts - std::vector counts; - int32 num_done = 0; - for (; !reader.Done(); reader.Next()) { - std::string key = reader.Key(); - std::vector alignment = reader.Value(); + RandomAccessBaseFloatVectorReader weights_reader; + if (frame_weights != "") { + weights_reader.Open(frame_weights); + } + RandomAccessBaseFloatReader utt_weights_reader; + if (utt_weights != "") { + utt_weights_reader.Open(utt_weights); + } + // Buffer for the counts, + Vector counts(counts_dim, kSetZero); + + // Get the counts, + int32 num_done = 0, num_other_error = 0; + for (; !alignment_reader.Done(); alignment_reader.Next()) { + std::string utt = alignment_reader.Key(); + // check we have per-frame weights, + if (frame_weights != "" && !weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-frame weights"; + num_other_error++; + continue; + } + // check we have per-utterance weights, + if (utt_weights != "" && !utt_weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-utterance weight"; + num_other_error++; + continue; + } + + // Get the alignment, + const std::vector &alignment = alignment_reader.Value(); + + // Get the weights, + BaseFloat utt_w = (utt_weights == "" ? 1.0 : utt_weights_reader.Value(utt)); + Vector frame_w; + if (frame_weights != "") { + frame_w = weights_reader.Value(utt); + KALDI_ASSERT(frame_w.Dim() == alignment.size()); + } + + // Accumulate the counts, for (size_t i = 0; i < alignment.size(); i++) { - int32 value = alignment[i]; - if(value >= counts.size()) { - counts.resize(value+1); + // Extend the vector if needed, + if (alignment[i] >= counts.Dim()) { + Vector tmp(counts); + counts.Resize(alignment[i]+1, kSetZero); + counts.Range(0, tmp.Dim()).CopyFromVec(tmp); } - counts[value]++; // Accumulate + // Accumulate, + counts(alignment[i]) += 1.0 * utt_w * (frame_weights == "" ? 1.0 : frame_w(i)); } num_done++; } - // We need at least one occurence for each tgt, so there is no nan during decoding - std::vector counts_nozero(counts); - for(size_t i = 0; i < counts.size(); i++) { - if(counts_nozero[i] == 0) { - KALDI_WARN << "Zero count for element " << i << ", force setting to one." 
- << " This avoids divide-by-zero when we use the counts in decoding."; - counts_nozero[i]++; + // Report elements with zero counts (this is suspicious), + for (size_t i = 0; i < counts.Dim(); i++) { + if (0.0 == counts(i)) { + KALDI_WARN << "Zero count for label " << i << ", this is suspicious."; } } - // Write + // Add a ``half-frame'' to all the elements, + // (avoids zero-counts, which would cause problems in decoding), + Vector counts_nozero(counts); + counts_nozero.Add(0.5); + + // Write, Output ko(wxfilename, binary); - WriteIntegerVector(ko.Stream(), binary, counts_nozero); + counts_nozero.Write(ko.Stream(), binary); //// //// THE REST IS FOR ANALYSIS, IT GETS PRINTED TO LOG @@ -108,16 +156,16 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Could not read symbol table from file " << symbol_table_filename; } - // sort the counts - std::vector > sorted_counts; - for (int32 i = 0; i < counts.size(); i++) { - sorted_counts.push_back(std::make_pair(static_cast(counts[i]), i)); + // sort the counts, + std::vector > sorted_counts; + for (int32 i = 0; i < counts.Dim(); i++) { + sorted_counts.push_back(std::make_pair(static_cast(counts(i)), i)); } std::sort(sorted_counts.begin(), sorted_counts.end()); - // print + // print, std::ostringstream os; - int32 sum = std::accumulate(counts.begin(),counts.end(), 0); + double sum = counts.Sum(); os << "Printing...\n### The sorted count table," << std::endl; os << "count\t(norm),\tid\t(symbol):" << std::endl; for (int32 i=0; i > & edit_word_per_hyp) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + int32 num_words = 0, word_errs = 0, num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent; + if (!hyp_reader.HasKey(key)) { + if (mode == "strict") + KALDI_ERR << "No hypothesis for key " << key << " and strict " + "mode specifier."; + if (mode == "present") // do not score this one. 
+ continue; + } else { + hyp_sent = hyp_reader.Value(key); + } + num_words = ref_sent.size(); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + } +} + +void GetEditsDualHyp(const std::string &hyp_rspecifier, + const std::string &hyp_rspecifier2, + const std::string &ref_rspecifier, + const std::string &mode, + std::vector > & edit_word_per_hyp, + std::vector > & edit_word_per_hyp2) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + RandomAccessTokenVectorReader hyp_reader2(hyp_rspecifier2); + int32 num_words = 0, word_errs = 0, + num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent, hyp_sent2; + if (mode == "strict" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) { + KALDI_ERR << "No hypothesis for key " << key << " in both transcripts " + "comparison is not possible."; + } else if (mode == "present" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) + continue; + + num_words = ref_sent.size(); + + //all mode, if a hypothesis is not present, consider as an error + if(hyp_reader.HasKey(key)){ + hyp_sent = hyp_reader.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + + if(hyp_reader2.HasKey(key)){ + hyp_sent2 = hyp_reader2.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent2, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp2.push_back(std::pair(word_errs, num_words)); + } +} + +void GetBootstrapWERInterval( + const std::vector > & edit_word_per_hyp, + int32 replications, + BaseFloat *mean, BaseFloat *interval) { + BaseFloat wer_accum = 0.0, wer_mult_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 num_words = 0, word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first; + num_words += edit_word_per_hyp[random_pos].second; + } + + BaseFloat wer_rep = static_cast(word_errs) / num_words; + wer_accum += wer_rep; + wer_mult_accum += wer_rep*wer_rep; + } + + // Compute mean WER and std WER + *mean = wer_accum / replications; + *interval = 1.96*sqrt(wer_mult_accum/replications-(*mean)*(*mean)); +} + +void GetBootstrapWERTwoSystemComparison( + const std::vector > & edit_word_per_hyp, + const std::vector > & edit_word_per_hyp2, + int32 replications, BaseFloat *p_improv) { + int32 improv_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first - + edit_word_per_hyp2[random_pos].first; + } + if(word_errs > 0) + ++improv_accum; + } + // Compute mean WER and std WER + *p_improv = static_cast(improv_accum) / replications; +} + +} //namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + + try { + const char *usage = + "Compute a bootstrapping of WER to 
extract the 95\% confidence interval.\n" + "Take a reference and a transcription file, in integer or text format,\n" + "and outputs overall WER statistics to standard output along with its\n" + "confidence interval using the bootstrap methos of Bisani and Ney.\n" + "If a second transcription file corresponding to the same reference is\n" + "provided, a bootstrap comparison of the two transcription is performed\n" + "to estimate the probability of improvement.\n" + "\n" + "Usage: compute-wer-bootci [options] []\n" + "E.g.: compute-wer-bootci --mode=present ark:data/train/text ark:hyp_text\n" + "or compute-wer-bootci ark:data/train/text ark:hyp_text ark:hyp_text2\n" + "See also: compute-wer\n"; + + ParseOptions po(usage); + + std::string mode = "strict"; + po.Register("mode", &mode, + "Scoring mode: \"present\"|\"all\"|\"strict\":\n" + " \"present\" means score those we have transcriptions for\n" + " \"all\" means treat absent transcriptions as empty\n" + " \"strict\" means die if all in ref not also in hyp"); + + int32 replications = 10000; + po.Register("replications", &replications, + "Number of replications to compute the intervals"); + + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + std::string ref_rspecifier = po.GetArg(1); + std::string hyp_rspecifier = po.GetArg(2); + std::string hyp2_rspecifier = (po.NumArgs() == 3?po.GetArg(3):""); + + if (mode != "strict" && mode != "present" && mode != "all") { + KALDI_ERR << + "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got " + << mode; + } + + //Get editions per each utterance + std::vector > edit_word_per_hyp, edit_word_per_hyp2; + if(hyp2_rspecifier.empty()) + GetEditsSingleHyp(hyp_rspecifier, ref_rspecifier, mode, edit_word_per_hyp); + else + GetEditsDualHyp(hyp_rspecifier, hyp2_rspecifier, ref_rspecifier, mode, + edit_word_per_hyp, edit_word_per_hyp2); + + //Extract WER for a number of replications of the same size + //as the hypothesis extracted + BaseFloat mean_wer = 0.0, interval = 0.0, + mean_wer2 = 0.0, interval2 = 0.0, + p_improv = 0.0; + + GetBootstrapWERInterval(edit_word_per_hyp, replications, + &mean_wer, &interval); + + if(!hyp2_rspecifier.empty()) { + GetBootstrapWERInterval(edit_word_per_hyp2, replications, + &mean_wer2, &interval2); + + GetBootstrapWERTwoSystemComparison(edit_word_per_hyp, edit_word_per_hyp2, + replications, &p_improv); + } + + // Print the output, + std::cout.precision(2); + std::cerr.precision(2); + std::cout << "Set1: %WER " << std::fixed << 100*mean_wer << + " 95\% Conf Interval [ " << 100*mean_wer-100*interval << + ", " << 100*mean_wer+100*interval << " ]" << '\n'; + + if(!hyp2_rspecifier.empty()) { + std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 << + " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 << + ", " << 100*mean_wer2+100*interval2 << " ]" << '\n'; + + std::cout << "Probability of Set2 improving Set1: " << std::fixed << + 100*p_improv << '\n'; + } + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 20f58d52b7d..42404e38384 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -101,7 +101,8 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 TypeTwoUsage(const ParseOptions &po, - bool binary) { + bool binary, + bool average = false) { KALDI_ASSERT(po.NumArgs() == 2); KALDI_ASSERT(ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && "vector-sum: first argument must 
be an rspecifier"); @@ -133,6 +134,8 @@ int32 TypeTwoUsage(const ParseOptions &po, } } } + + if (num_done > 0 && average) sum.Scale(1.0 / num_done); Vector sum_float(sum); WriteKaldiObject(sum_float, po.GetArg(2), binary); @@ -199,12 +202,13 @@ int main(int argc, char *argv[]) { " e.g.: vector-sum --binary=false 1.vec 2.vec 3.vec sum.vec\n" "See also: copy-vector, dot-weights\n"; - bool binary; + bool binary, average = false; ParseOptions po(usage); po.Register("binary", &binary, "If true, write output as binary (only " "relevant for usage types two or three"); + po.Register("average", &average, "Do average instead of sum"); po.Read(argc, argv); @@ -219,7 +223,7 @@ int main(int argc, char *argv[]) { ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == kNoWspecifier) { // input from a single table, output not to table. - exit_status = TypeTwoUsage(po, binary); + exit_status = TypeTwoUsage(po, binary, average); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == diff --git a/src/chain/Makefile b/src/chain/Makefile index e24913c06f2..c02844767f8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -12,7 +12,7 @@ OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o ifeq ($(CUDA), true) - OBJFILES += chain-kernels.o + OBJFILES += chain-kernels.o endif LIBNAME = kaldi-chain @@ -53,7 +53,7 @@ endif ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ ../fstext/kaldi-fstext.a \ - ../matrix/kaldi-matrix.a ../cudamatrix/kaldi-cudamatrix.a \ + ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ ../util/kaldi-util.a ../base/kaldi-base.a diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h index 7ea58038918..52e388a3f2e 100644 --- a/src/chain/chain-datastruct.h +++ b/src/chain/chain-datastruct.h @@ -45,7 +45,8 @@ extern "C" { }; - + // Search for this in chain-kernels.cu for an explanation. 
+ enum { kThresholdingPowerOfTwo = 14 }; } diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index a654ad7d05f..ceb61a550f0 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -139,77 +139,6 @@ void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { Vector avg_prob_float(avg_prob); initial_probs_ = avg_prob_float; - special_hmm_state_ = ComputeSpecialState(fst, avg_prob_float); -} - -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state) { - int32 num_states = fst.NumStates(), - num_states_can_reach = 0; - KALDI_ASSERT(dest_state >= 0 && dest_state < num_states); - std::vector can_reach(num_states, false); - std::vector > reverse_transitions(num_states); - for (int32 s = 0; s < num_states; s++) - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - reverse_transitions[aiter.Value().nextstate].push_back(s); - std::vector queue; - can_reach[dest_state] = true; - queue.push_back(dest_state); - num_states_can_reach++; - while (!queue.empty()) { - int32 state = queue.back(); - queue.pop_back(); - std::vector::const_iterator iter = reverse_transitions[state].begin(), - end = reverse_transitions[state].end(); - for (; iter != end; ++iter) { - int32 prev_state = *iter; - if (!can_reach[prev_state]) { - can_reach[prev_state] = true; - queue.push_back(prev_state); - num_states_can_reach++; - } - } - } - KALDI_ASSERT(num_states_can_reach >= 1 && - num_states_can_reach <= num_states); - return num_states_can_reach; -} - - -int32 DenominatorGraph::ComputeSpecialState( - const fst::StdVectorFst &fst, - const Vector &initial_probs) { - int32 num_states = initial_probs.Dim(); - std::vector > pairs(num_states); - for (int32 i = 0; i < num_states; i++) - pairs.push_back(std::pair(-initial_probs(i), i)); - // the first element of each pair is the negative of the initial-prob, - // so when we sort, the highest initial-prob will be first. - std::sort(pairs.begin(), pairs.end()); - // this threshold of 0.75 is pretty arbitrary. We reject any - // state if it can't be reached by 75% of all other states. - // In practice we think that states will either be reachable by - // almost-all states, or almost-none (e.g. states that are active - // only at utterance-beginning), so this threshold shouldn't - // be too critical. - int32 min_states_can_reach = 0.75 * num_states; - for (int32 i = 0; i < num_states; i++) { - int32 state = pairs[i].second; - int32 n = NumStatesThatCanReach(fst, state); - if (n < min_states_can_reach) { - KALDI_WARN << "Rejecting state " << state << " as a 'special' HMM state " - << "(for renormalization in fwd-bkwd), because it's only " - << "reachable by " << n << " out of " << num_states - << " states."; - } else { - return state; - } - } - KALDI_ERR << "Found no states that are reachable by at least " - << min_states_can_reach << " out of " << num_states - << " states. This is unexpected. Change the threshold"; - return -1; } void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, @@ -261,6 +190,34 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { fst::Decode(fst, encoder); } +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. 
The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { fst::PushSpecial(fst, fst::kDelta * 0.01); @@ -414,6 +371,8 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, DenGraphMinimizeWrapper(&transition_id_fst); + SortOnTransitionCount(&transition_id_fst); + *den_fst = transition_id_fst; CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); PrintDenGraphStats(*den_fst); diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h index 8e5ee39e4bd..b2510651f39 100644 --- a/src/chain/chain-den-graph.h +++ b/src/chain/chain-den-graph.h @@ -88,13 +88,6 @@ class DenominatorGraph { // Note: we renormalize each HMM-state to sum to one before doing this. const CuVector &InitialProbs() const; - // returns the index of the HMM-state that has the highest value in - // InitialProbs (and which we believe will always be reachable from most other - // states... later on we may check this more carefully [TODO]). - // It's used in getting the 'arbitrary_scale' value to keep the alphas - // in a good dynamic range. - int32 SpecialHmmState() const { return special_hmm_state_; } - // This function outputs a modifified version of the FST that was used to // build this object, that has an initial-state with epsilon transitions to // each state, with weight determined by initial_probs_; and has each original @@ -116,23 +109,15 @@ class DenominatorGraph { // functions called from the constructor void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); - // work out the initial-probs and the 'special state' - // Note, there are no final-probs; we treat all states as final - // with probability one [we have a justification for this.. - // assuming it's roughly a well-normalized HMM, this makes sense; - // note that we train on chunks, so the beginning and end of a chunk - // appear at arbitrary points in the sequence. - // At both beginning and end of the chunk, we limit ourselves to - // only those pdf-ids that were allowed in the numerator sequence. + // work out the initial-probs. Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. 
At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. void SetInitialProbs(const fst::StdVectorFst &fst); - // return a suitable 'special' HMM-state used for normalizing probabilities in - // the forward-backward. It has to have a reasonably high probability and be - // reachable from most of the graph. returns a suitable state-index - // that we can set special_hmm_state_ to. - int32 ComputeSpecialState(const fst::StdVectorFst &fst, - const Vector &initial_probs); - // forward_transitions_ is an array, indexed by hmm-state index, // of start and end indexes into the transition_ array, which // give us the set of transitions out of this state. @@ -152,23 +137,9 @@ class DenominatorGraph { // distribution of the HMM. This isn't too critical. CuVector initial_probs_; - // The index of a somewhat arbitrarily chosen HMM-state that we - // use for adjusting the alpha probabilities. It needs to be - // one that is reachable from all states (i.e. not a special - // state that's only reachable at sentence-start). We choose - // whichever one has the greatest initial-prob. It's set - // in SetInitialProbs(). - int32 special_hmm_state_; - int32 num_pdfs_; }; -// returns the number of states from which there is a path to -// 'dest_state'. Utility function used in selecting 'special' state -// for normalization of probabilities. -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state); - // Function that does acceptor minimization without weight pushing... // this is useful when constructing the denominator graph. diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index eaee850a999..258c33cd465 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -39,12 +39,23 @@ DenominatorComputation::DenominatorComputation( std::min(exp_nnet_output_transposed_.NumCols(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), - alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_, + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), - beta_(2, den_graph_.NumStates() * num_sequences_, kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), tot_prob_(num_sequences_, kUndefined), tot_log_prob_(num_sequences_, kUndefined), - log_correction_term_(num_sequences_, kUndefined) { + log_correction_term_(num_sequences_, kUndefined), + ok_(true) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + // make sure the alpha sums and beta sums are zeroed. 
+ alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); exp_nnet_output_transposed_.ApplyExp(); } @@ -70,13 +81,12 @@ void DenominatorComputation::AlphaFirstFrame() { void DenominatorComputation::AlphaGeneralFrame(int32 t) { KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); BaseFloat *this_alpha = alpha_.RowData(t); - const BaseFloat *prev_alpha = alpha_.RowData(t - 1); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = den_graph_.SpecialHmmState(); + num_sequences = num_sequences_; // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, @@ -90,8 +100,8 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, - num_sequences, special_hmm_state, prob_data, - probs.Stride(), prev_alpha, this_alpha); + num_sequences, prob_data, probs.Stride(), + prev_alpha_dash, this_alpha); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -110,18 +120,19 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { int32 pdf_id = trans_iter->pdf_id, prev_hmm_state = trans_iter->hmm_state; BaseFloat prob = prob_data[pdf_id * prob_stride + s], - this_prev_alpha = prev_alpha[prev_hmm_state * num_sequences + s]; + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -129,37 +140,89 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { } } +void DenominatorComputation::AlphaDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. 
+ // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + // the alpha-dash is the sum of alpha over all states. + CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. +} + +// compute beta from beta-dash. +void DenominatorComputation::Beta(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat + // will contain the actual beta (i.e. the counterpart of alpha), + // not the beta-dash. + beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); +} + BaseFloat DenominatorComputation::Forward() { AlphaFirstFrame(); - for (int32 t = 1; t <= frames_per_sequence_; t++) + AlphaDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { AlphaGeneralFrame(t); + AlphaDash(t); + } return ComputeTotLogLike(); } BaseFloat DenominatorComputation::ComputeTotLogLike() { tot_prob_.Resize(num_sequences_); - // View the last alpha as a matrix of size num-hmm-states by num-sequences. - CuSubMatrix last_alpha(alpha_.RowData(frames_per_sequence_), - den_graph_.NumStates(), - num_sequences_, - num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); - tot_prob_.AddRowSumMat(1.0, last_alpha, 0.0); + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); // we should probably add an ApplyLog() function that takes a vector argument. tot_log_prob_ = tot_prob_; tot_log_prob_.ApplyLog(); BaseFloat tot_log_prob = tot_log_prob_.Sum(); - // We now have to add something for the arbitrary scaling factor. the - // inverses of all the alphas for hmm-states numbered zero, for t = 0 - // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in the - // transition-probs, so we need to multiply them all together (not inversed) - // and add them as a correction term to the total log-likes. Note: the + // We now have to add something for the arbitrary scaling factor. [note: the // purpose of the arbitrary scaling factors was to keep things in a good - // floating-point range. + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. 
+ // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); CuSubMatrix inv_arbitrary_scales( alpha_, 0, frames_per_sequence_, - num_sequences_ * den_graph_.SpecialHmmState(), num_sequences_); + num_sequences_ * num_hmm_states, num_sequences_); CuMatrix log_inv_arbitrary_scales( inv_arbitrary_scales); log_inv_arbitrary_scales.ApplyLog(); @@ -170,12 +233,16 @@ BaseFloat DenominatorComputation::ComputeTotLogLike() { -void DenominatorComputation::Backward( +bool DenominatorComputation::Backward( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { - BetaLastFrame(); + BetaDashLastFrame(); + Beta(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { - BetaGeneralFrame(t); + BetaDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaGeneralFrameDebug(t); + Beta(t); if (t % kMaxDerivTimeSteps == 0) { // commit the derivative stored in exp_nnet_output_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. @@ -190,35 +257,35 @@ void DenominatorComputation::Backward( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, - kTrans); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); if (t != 0) transposed_deriv_part.SetZero(); } } + return ok_; } -void DenominatorComputation::BetaLastFrame() { - // sets up the beta on the last frame (frame == frames_per_sequence_). Note that - // the betas we use here contain a 1/(tot-prob) factor in order to simplify - // the backprop. +void DenominatorComputation::BetaDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. int32 t = frames_per_sequence_; - BaseFloat *last_frame_beta = beta_.RowData(t % 2); + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); // create a 'fake matrix' - view this row as a matrix. - CuSubMatrix beta_mat(last_frame_beta, - den_graph_.NumStates(), - num_sequences_, - num_sequences_); + CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); CuVector inv_tot_prob(tot_prob_); inv_tot_prob.InvertElements(); // the beta values at the end of the file only vary with the sequence-index, // not with the HMM-index. We treat all states as having a final-prob of one. - beta_mat.CopyRowsFromVec(inv_tot_prob); + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); } -void DenominatorComputation::BetaGeneralFrame(int32 t) { +void DenominatorComputation::BetaDashGeneralFrame(int32 t) { KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); // t_wrapped gives us the time-index we use when indexing @@ -226,9 +293,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { // matrix, storing only chunks of frames at a time, and we add it to the // non-transposed output whenever we finish a chunk. 
int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); - const BaseFloat *this_alpha = alpha_.RowData(t), + const BaseFloat *this_alpha_dash = alpha_.RowData(t), *next_beta = beta_.RowData((t + 1) % 2); - BaseFloat *this_beta = beta_.RowData(t % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2); const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); // 'probs' is the matrix of pseudo-likelihoods for frame t. @@ -238,8 +305,7 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = den_graph_.SpecialHmmState(); + num_sequences = num_sequences_; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -247,10 +313,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, - num_sequences, special_hmm_state, - probs.Data(), probs.Stride(), this_alpha, next_beta, - this_beta, log_prob_deriv.Data(), - log_prob_deriv.Stride()); + num_sequences, probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -262,12 +327,12 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { - BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha_dash[num_hmm_states * num_sequences + s]; double tot_variable_factor = 0.0; - BaseFloat - occupation_factor = this_alpha_prob / inv_arbitrary_scale; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, *trans_end = transitions + forward_transitions[h].second; @@ -282,13 +347,49 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { BaseFloat occupation_prob = variable_factor * occupation_factor; log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; } - this_beta[h * num_sequences + s] = + this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; } } } } +void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix this_log_prob_deriv( + nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " 
alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. + if (!ApproxEqual(this_log_prob_deriv_sum, + num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } +} + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index f3b0afa6721..b0f616673d6 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -41,6 +41,153 @@ namespace kaldi { namespace chain { +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. 
the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. 
It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + // This does forward-backward in parallel on a number of sequences, using a // single HMM. class DenominatorComputation { @@ -70,7 +217,8 @@ class DenominatorComputation { // this adds deriv_weight times (the derivative of the log-prob w.r.t. the // nnet output), to 'nnet_output_deriv'. - void Backward(BaseFloat deriv_weight, + // returns true if everything seemed OK, false if a failure was detected. + bool Backward(BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv); private: @@ -84,6 +232,9 @@ class DenominatorComputation { void AlphaFirstFrame(); // the alpha computation for some 0 < t <= num_time_steps_. void AlphaGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaDash(int32 t); // done after all the alphas, this function computes and returns the total // log-likelihood summed over all the sequences, and sets tot_prob_ (if we're @@ -92,9 +243,15 @@ class DenominatorComputation { // from the Forward() computation). BaseFloat ComputeTotLogLike(); - void BetaLastFrame(); + void BetaDashLastFrame(); // beta computation for 0 <= beta < num_time_steps_. - void BetaGeneralFrame(int32 t); + void BetaDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void Beta(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaGeneralFrameDebug(int32 t); const ChainTrainingOptions &opts_; const DenominatorGraph &den_graph_; @@ -116,13 +273,18 @@ class DenominatorComputation { // the derivs w.r.t. the nnet outputs (transposed) CuMatrix nnet_output_deriv_transposed_; - // the alpha probabilities; dimension is (frames_per_sequence + 1) by (num-hmm-states - // * num-sequences). Note, they are not logs. + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. CuMatrix alpha_; - // the beta probabilities (rolling buffer); dimension is 2 * (num-hmm-states * - // num-sequences). Note: for efficiency and to simplify the equations, these - // are actually the beta / tot_prob_. 
+ // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. CuMatrix beta_; // the total probability for each sequence, excluding the product of @@ -136,11 +298,13 @@ class DenominatorComputation { CuVector tot_log_prob_; // the log of the total correction term for each sequence, which is the - // product of the alpha_[special hmm state] over all the frames. The - // 'correction terms' are terms that we divide the alphas and betas by in - // order to keep them in a good dynamic range. The product of them - // must be included in the total likelihood. + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. CuVector log_correction_term_; + + bool ok_; }; diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index af7a1a6b176..8ec1dcf322c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -29,7 +29,6 @@ extern "C" { const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, @@ -42,7 +41,6 @@ extern "C" { const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 8fcf8037d36..ea10b6680f0 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -40,9 +40,9 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { // threshold itself with probability (value / threshold). This preserves // expectations. Note: we assume that value >= 0. - // you can choose any value for the threshold, but powers of 2 are nice - // because they will exactly preserve the precision of the value. - const Real threshold = 1.0 / (1 << 14); + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); if (value >= threshold) { atomic_add(address, value); } else { @@ -67,7 +67,6 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { if ((x >> 12) > (x & 4095)) atomic_add(address, threshold); } - } // one iteration of the forward computation in the 'tombstone' CTC HMM computation. 
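The atomic_add_thresholded() device function above commits small derivative values only probabilistically: anything below threshold = 1.0 / (1 << kThresholdingPowerOfTwo) is either dropped or rounded up to the full threshold, with the probability chosen so that the expected amount added is unchanged. The host-side sketch below shows just that expectation-preserving idea; it uses an explicit RNG where the kernel uses a cheap deterministic bit-mixing test, so it is an illustration rather than the kernel's actual logic.

  #include <random>

  // Adds 'value' (assumed >= 0) into *acc, quantizing values below 'threshold':
  // add the full threshold with probability value / threshold, else add nothing.
  // Either way E[amount added] == value, so the accumulated derivatives are
  // preserved in expectation.
  void AddThresholded(double value, double threshold,
                      std::mt19937 *rng, double *acc) {
    if (value >= threshold) {
      *acc += value;
    } else {
      std::uniform_real_distribution<double> u(0.0, 1.0);
      if (u(*rng) < value / threshold)
        *acc += threshold;
    }
  }

In the kernel the threshold is 2^-14 (kThresholdingPowerOfTwo = 14), small enough that the added randomness is negligible while saving many atomic adds.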
@@ -82,7 +81,6 @@ __global__ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, @@ -137,15 +135,18 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + int32_cuda num_hmm_states = gridDim.y; + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -154,7 +155,6 @@ __global__ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, @@ -179,10 +179,14 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, if (s >= num_sequences) return; + // below, you can read 'gridDim.y' as 'num_hmm_states'. See where + // arbitrary_scale is defined in the forward computation above, for more + // explanation. 
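The forward-kernel hunk above replaces the old special_hmm_state normalizer with the per-sequence alpha-sum stored one past the last real state. A compact sketch of the resulting column layout (these helpers are illustrative, not part of the patch):

// Each row of the alpha matrix covers one frame and has
// num_hmm_states * num_sequences + num_sequences entries.

// Column holding alpha(h, s): state h of sequence s.
inline int AlphaColumn(int h, int s, int num_sequences) {
  return h * num_sequences + s;
}

// Column holding the alpha-sum for sequence s: where state 'num_hmm_states'
// would go, i.e. one past the end of the real states.
inline int AlphaSumColumn(int s, int num_hmm_states, int num_sequences) {
  return num_hmm_states * num_sequences + s;
}

// The forward kernel then uses
//   arbitrary_scale = 1.0 / prev_alpha[AlphaSumColumn(s, num_hmm_states, num_sequences)]
// to keep the alphas near 1; the product of these per-frame sums is what
// log_correction_term_ accumulates, as noted in the comment above.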
BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha[gridDim.y * num_sequences + s]; double tot_variable_factor = 0.0; + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, @@ -223,7 +227,8 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), occupation_prob0); } - this_beta[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; + BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; + this_beta[h * num_sequences + s] = beta; } @@ -231,28 +236,26 @@ void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, BaseFloat *this_alpha) { _cuda_chain_hmm_forward<<>>(backward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, prev_alpha, this_alpha); + num_sequences, probs, prob_stride, + prev_alpha, this_alpha); } void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { _cuda_chain_hmm_backward<<>>(forward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, this_alpha, next_beta, + num_sequences, probs, prob_stride, + this_alpha, next_beta, this_beta, log_prob_deriv, log_prob_deriv_stride); } diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 1dc9d9d489d..15cb31e0571 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -76,8 +76,8 @@ class NumeratorComputation { BaseFloat Forward(); // Does the backward computation and (efficiently) adds the derivative of the - // nnet output w.r.t. the (log-prob times supervision_.weight) to - // 'nnet_output_deriv'. + // nnet output w.r.t. the (log-prob times supervision_.weight times + // deriv_weight) to 'nnet_output_deriv'. 
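In scalar terms, the backward kernel above factors the per-transition posterior so that the part depending only on the source state (occupation_factor) is hoisted out of the inner loop. The sketch below restates one transition's contribution using the notation of the long comment earlier; it is a derivation from that comment, not the kernel code itself, and the variable names are only meant to mirror it roughly.

// Contribution of one transition i -> j with pdf n, on frame t of sequence s.
float OccupationProbSketch(float alpha_dash_t_i,   // alpha'(t, i)
                           float beta_t1_j,        // beta(t+1, j)
                           float p,                // transition probability
                           float x_t_n,            // exp(nnet output) for pdf n
                           float alpha_sum_t) {    // 1 / arbitrary_scale on frame t
  float variable_factor = p * x_t_n * beta_t1_j;          // summed into tot_variable_factor
  float occupation_factor = alpha_dash_t_i / alpha_sum_t; // hoisted out of the loop
  return variable_factor * occupation_factor;             // added to log_prob_deriv(n, s)
}

Summing variable_factor over the transitions leaving state i and dividing by alpha_sum_t gives beta'(t, i), which is what this_beta stores; this is the same gamma / beta' update as in the CPU sketch above, reorganized to minimize per-transition work.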
void Backward(CuMatrixBase *nnet_output_deriv); private: diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index e6a333317e8..ea673df3291 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -251,15 +251,17 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, nnet_output.SetRandn(); ChainTrainingOptions opts; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; CuMatrix nnet_output_deriv(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat objf, weight; + BaseFloat objf, l2_term, weight; ComputeChainObjfAndDeriv(opts, den_graph, supervision, - nnet_output, &objf, &weight, + nnet_output, &objf, &l2_term, &weight, &nnet_output_deriv); { @@ -296,11 +298,12 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, CuMatrix nnet_output_perturbed(nnet_delta_output); nnet_output_perturbed.AddMat(1.0, nnet_output); - BaseFloat objf_modified, weight_modified; + BaseFloat objf_modified, l2_term_modified, weight_modified; ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output_perturbed, - &objf_modified, &weight_modified, + &objf_modified, &l2_term_modified, + &weight_modified, NULL); observed_objf_changes(p) = objf_modified - objf; @@ -419,21 +422,6 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) { 10.0); } - { // another check: that scaling the initial probs has the expected effect. - BaseFloat scale = 0.1 + 0.7 * RandUniform(); - DenominatorGraph den_graph_scaled(den_graph); - den_graph_scaled.ScaleInitialProbs(scale); - DenominatorComputation denominator_computation_scaled_initial( - opts, den_graph_scaled, - num_sequences, nnet_output); - BaseFloat forward_prob_scaled_initial = - denominator_computation_scaled_initial.Forward(); - BaseFloat observed_difference = - forward_prob_scaled_initial - forward_prob, - predicted_difference = num_sequences * log(scale); - AssertEqual(observed_difference, predicted_difference); - } - int32 num_tries = 5; BaseFloat epsilon = 1.0e-04; Vector predicted_objf_changes(num_tries), diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 03fdb3cbe64..7d699600bee 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -800,5 +800,27 @@ void GetWeightsForRanges(int32 range_length, } } + +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights) { + KALDI_ASSERT(range_length > 0 && num_frames_zeroed * 2 < range_length); + int32 num_ranges = range_starts.size(); + weights->resize(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + (*weights)[i].Resize(range_length); + (*weights)[i].Set(1.0); + } + if (num_frames_zeroed == 0) + return; + for (int32 i = 1; i < num_ranges; i++) + (*weights)[i].Range(0, num_frames_zeroed).Set(0.0); + for (int32 i = 0; i + 1 < num_ranges; i++) + (*weights)[i].Range(range_length - num_frames_zeroed, + num_frames_zeroed).Set(0.0); +} + + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index b17f62d00ad..2dda8baf1e4 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -355,7 +355,7 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, /// all the same it will only append Supervision objects where successive ones /// have the same weight and num-frames, and if 'compactify' is true. 
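The perturbation test in ChainTrainingTest above is a standard finite-difference check: the predicted objective change under a small random perturbation is the inner product of the computed derivative with that perturbation, and it should agree with the observed change in the objective. A generic plain-vector version of the predicted quantity (illustrative; the real test works on CuMatrix objects):

#include <vector>

float PredictedObjfChange(const std::vector<float> &nnet_output_deriv,
                          const std::vector<float> &delta) {
  float sum = 0.0f;
  for (size_t i = 0; i < nnet_output_deriv.size() && i < delta.size(); i++)
    sum += nnet_output_deriv[i] * delta[i];
  return sum;   // compare against objf(output + delta) - objf(output)
}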
The /// normal use-case for this is when you are combining neural-net examples for -/// training; appending them like this helps to simplify the decoding process. +/// training; appending them like this helps to simplify the training process. /// This function will crash if the values of label_dim in the inputs are not /// all the same. @@ -402,6 +402,28 @@ void GetWeightsForRanges(int32 range_length, std::vector > *weights); +/// This is a newer version of GetWeightsForRanges with a simpler behavior +/// than GetWeightsForRanges and a different purpose. Instead of aiming to +/// create weights that sum to one over the whole file, the purpose is to +/// zero out the derivative weights for a certain number of frames to each +/// side of every 'cut point' in the numerator lattice [by numerator lattice, +/// what I mean is the FST that we automatically generate from the numerator +/// alignment or lattice]. So we don't zero out the weights for the very +/// beginning or very end of each original utterance, just those where +/// we split the utterance into pieces. We believe there is an incentive +/// for the network to produce deletions near the edges, and this aims to fix +/// this problem. +/// range_length is the length of each range of times (so range_starts[0] +/// represents the start of a range of t values of length 'range_length' +/// and so range_starts[1] etc.), and num_frames_zeroed is the number of frames +/// on each side of the cut point on which we are supposed to zero out the +/// derivative. +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights); + + typedef TableWriter > SupervisionWriter; typedef SequentialTableReader > SequentialSupervisionReader; typedef RandomAccessTableReader > RandomAccessSupervisionReader; diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 42cdfed2713..1bf0201fbfa 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -29,9 +29,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *tot_objf, - BaseFloat *tot_weight, - CuMatrixBase *nnet_output_deriv) { + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { BaseFloat num_logprob_weighted; if (nnet_output_deriv) nnet_output_deriv->SetZero(); @@ -40,29 +42,44 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // note: supervision.weight is included as a factor in the derivative from // the numerator object, and the logprob too. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) + if (nnet_output_deriv) { numerator.Backward(nnet_output_deriv); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(*nnet_output_deriv); + } else if (xent_output_deriv) { + // this branch will be taken if xent_output_deriv but not + // nnet_output_deriv is set- which could happen if you want to compute the + // cross-entropy objective but not the derivatives. 
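To make the behavior of GetWeightsForRangesNew() concrete, here is a plain-vector restatement (the real function fills Vector<BaseFloat> objects). For range_length = 6, num_frames_zeroed = 1 and three ranges the weights come out as 1 1 1 1 1 0 / 0 1 1 1 1 0 / 0 1 1 1 1 1, i.e. derivatives are zeroed only next to the internal cut points, never at the true utterance edges.

#include <vector>

void WeightsForRangesNewSketch(int range_length, int num_frames_zeroed,
                               int num_ranges,
                               std::vector<std::vector<float> > *weights) {
  weights->assign(num_ranges, std::vector<float>(range_length, 1.0f));
  if (num_frames_zeroed == 0) return;
  for (int i = 1; i < num_ranges; i++)        // zero the start, except the first range
    for (int k = 0; k < num_frames_zeroed; k++)
      (*weights)[i][k] = 0.0f;
  for (int i = 0; i + 1 < num_ranges; i++)    // zero the end, except the last range
    for (int k = 0; k < num_frames_zeroed; k++)
      (*weights)[i][range_length - num_frames_zeroed + k] = 0.0f;
}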
+ xent_output_deriv->SetZero(); + numerator.Backward(xent_output_deriv); + } } DenominatorComputation denominator(opts, den_graph, supervision.num_sequences, nnet_output); BaseFloat den_logprob = denominator.Forward(); + bool ok = true; if (nnet_output_deriv) - denominator.Backward(-supervision.weight, - nnet_output_deriv); + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); - *tot_objf = num_logprob_weighted - supervision.weight * den_logprob; - *tot_weight = supervision.weight * supervision.num_sequences * + *objf = num_logprob_weighted - supervision.weight * den_logprob; + *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; - if (!(*tot_objf == *tot_objf)) { - // inf or NaN detected + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. if (nnet_output_deriv) nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); BaseFloat default_objf = -10; - KALDI_WARN << "Objective function is " << (*tot_objf) - << ", setting to " << default_objf << " per frame."; - *tot_objf = default_objf * *tot_weight; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; } // This code helps us see how big the derivatives are, on average, @@ -81,6 +98,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, row_products_per_frame(i / num_sequences) += row_products_cpu(i); KALDI_LOG << "Derivs per frame are " << row_products_per_frame; } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 8eb7e8343f4..e6143d10846 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -40,11 +40,44 @@ namespace chain { struct ChainTrainingOptions { - // Currently empty. - - ChainTrainingOptions() { } - + // l2 regularization constant on the 'chain' output; the actual term added to + // the objf will be -0.5 times this constant times the squared l2 norm. + // (squared so it's additive across the dimensions). e.g. try 0.0005. + BaseFloat l2_regularize; + + // Coefficient for 'leaky hmm'. This means we have an epsilon-transition from + // each state to a special state with probability one, and then another + // epsilon-transition from that special state to each state, with probability + // leaky_hmm_coefficient times [initial-prob of destination state]. Imagine + // we make two copies of each state prior to doing this, version A and version + // B, with transition from A to B, so we don't have to consider epsilon loops- + // or just imagine the coefficient is small enough that we can ignore the + // epsilon loops. + BaseFloat leaky_hmm_coefficient; + + + // Cross-entropy regularization constant. (e.g. try 0.1). If nonzero, + // the network is expected to have an output named 'output-xent', which + // should have a softmax as its final nonlinearity. 
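The l2 term added in chain-training.cc above is simply a scaled sum of squares of the nnet output, with the matching derivative added to nnet_output_deriv (the real code uses TraceMatMat(nnet_output, nnet_output, kTrans) for the sum of squares and AddMat for the derivative). A scalar sketch of the same arithmetic:

#include <vector>
#include <cstddef>

// l2_term        = -0.5 * (supervision_weight * l2_regularize) * sum_i y_i^2
// d(l2_term)/dy_i = -(supervision_weight * l2_regularize) * y_i
float L2TermSketch(const std::vector<float> &nnet_output,
                   float supervision_weight, float l2_regularize,
                   std::vector<float> *nnet_output_deriv) {
  float scale = supervision_weight * l2_regularize, sumsq = 0.0f;
  for (std::size_t i = 0; i < nnet_output.size(); i++) {
    sumsq += nnet_output[i] * nnet_output[i];
    if (nnet_output_deriv != NULL)
      (*nnet_output_deriv)[i] += -scale * nnet_output[i];
  }
  return -0.5f * scale * sumsq;
}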
+ BaseFloat xent_regularize; + + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), + xent_regularize(0.0) { } + void Register(OptionsItf *opts) { + opts->Register("l2-regularize", &l2_regularize, "l2 regularization " + "constant for 'chain' training, applied to the output " + "of the neural net."); + opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " + "that allows transitions from each HMM state to each other " + "HMM state, to ensure gradual forgetting of context (can " + "improve generalization). For numerical reasons, may not be " + "exactly zero."); + opts->Register("xent-regularize", &xent_regularize, "Cross-entropy " + "regularization constant for 'chain' training. If " + "nonzero, the network is expected to have an output " + "named 'output-xent', which should have a softmax as " + "its final nonlinearity."); } }; @@ -59,10 +92,13 @@ struct ChainTrainingOptions { paths and constraints on the alignment as an FST @param [in] nnet_output The output of the neural net; dimension must equal ((supervision.num_sequences * supervision.frames_per_sequence) by - den_graph.NumPdfs()). + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. @param [out] objf The [num - den] objective function computed for this example; you'll want to divide it by 'tot_weight' before displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o @param [out] weight The weight to normalize the objective function by; equals supervision.weight * supervision.num_sequences * supervision.frames_per_sequence. @@ -70,14 +106,22 @@ struct ChainTrainingOptions { the neural-net output. Only written to if non-NULL. You don't have to zero this before passing to this function, we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. 
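The row ordering documented above ("all sequences for frame 0; all sequences for frame 1; ...") means the output for frame t of sequence s lives at the row given by the helper below (an illustrative helper, not part of the API); this is the same layout the denominator kernels assume when they index column s within each frame block.

inline int NnetOutputRow(int t, int s, int num_sequences) {
  return t * num_sequences + s;   // frame-major, sequence-minor
}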
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *tot_objf, - BaseFloat *tot_weight, - CuMatrixBase *nnet_output_deriv); + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); + } // namespace chain diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index 3bdf710c489..3f092879b6e 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -40,9 +40,11 @@ class NnetChainLdaStatsAccumulator { void AccStats(const NnetChainExample &eg) { ComputationRequest request; - bool need_backprop = false, store_stats = false; + bool need_backprop = false, store_stats = false, + need_xent = false, need_xent_deriv = false; - GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, &request); + GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, + need_xent, need_xent_deriv, &request); const NnetComputation &computation = *(compiler_.Compile(request)); diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 4e32d280638..ed162d1d18b 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -25,6 +25,7 @@ #include "hmm/posterior.h" #include "nnet3/nnet-example.h" #include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" namespace kaldi { namespace nnet3 { @@ -48,6 +49,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 frames_per_eg, int32 frames_overlap_per_eg, int32 frame_subsampling_factor, + int32 cut_zero_frames, int64 *num_frames_written, int64 *num_egs_written, NnetChainExampleWriter *example_writer) { @@ -57,13 +59,36 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, num_feature_frames_subsampled = (num_feature_frames + frame_subsampling_factor - 1)/ frame_subsampling_factor; - if (num_output_frames != num_feature_frames_subsampled) - KALDI_ERR << "Mismatch in num-frames: chain supervision has " - << num_output_frames - << " versus features/frame_subsampling_factor = " - << num_feature_frames << " / " << frame_subsampling_factor - << ": check that --frame-subsampling-factor option is set " - << "the same as to chain-get-supervision."; + if (num_output_frames != num_feature_frames_subsampled) { + // we tolerate deviations in the num-frames if they are very small (1 output + // frame). + + if (abs(num_output_frames - num_feature_frames_subsampled) > 1) { + KALDI_ERR << "Mismatch in num-frames: chain supervision has " + << num_output_frames + << " versus features/frame_subsampling_factor = " + << num_feature_frames << " / " << frame_subsampling_factor + << " = " << num_feature_frames_subsampled + << ": check that --frame-subsampling-factor option is set " + << "the same as to chain-get-supervision."; + } + int32 new_num_feature_frames = + num_output_frames * frame_subsampling_factor; + // add a few frames at the end to make it match up. 
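The rest of this hunk, just below, implements this length fix on Kaldi Matrix objects; as a plain-vector sketch of the same idea: when the supervision and the subsampled features disagree by at most one output frame, copy the features into a matrix of the corrected length, repeating the final feature frame if the features were too short (and truncating if they were too long). Names here are illustrative only.

#include <vector>

std::vector<std::vector<float> > PadFeaturesSketch(
    const std::vector<std::vector<float> > &feats, int new_num_frames) {
  std::vector<std::vector<float> > feats_new;
  feats_new.reserve(new_num_frames);
  int num_frames = static_cast<int>(feats.size());
  for (int i = 0; i < new_num_frames; i++) {
    int src = (i < num_frames ? i : num_frames - 1);  // repeat last frame if short
    feats_new.push_back(feats[src]);
  }
  return feats_new;
}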
+ Matrix feats_new(new_num_feature_frames, feats.NumCols(), + kUndefined); + int32 min_feature_frames = std::min(num_feature_frames, + new_num_feature_frames); + feats_new.RowRange(0, min_feature_frames).CopyFromMat( + feats.RowRange(0, min_feature_frames)); + for (int32 i = num_feature_frames; i < new_num_feature_frames; i++) + feats_new.Row(i).CopyFromVec(feats.Row(num_feature_frames - 1)); + return ProcessFile(normalization_fst, feats_new, ivector_feats, + supervision, utt_id, compress, left_context, right_context, + frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, + cut_zero_frames, num_frames_written, num_egs_written, + example_writer); + } KALDI_ASSERT(frames_per_eg % frame_subsampling_factor == 0); @@ -88,9 +113,15 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, // to the edge are not as accurate as they could be, because when we split we // don't know the correct alphas and betas). std::vector > deriv_weights; - chain::GetWeightsForRanges(frames_per_eg_subsampled, - range_starts_subsampled, - &deriv_weights); + if (cut_zero_frames >= 0) + chain::GetWeightsForRangesNew(frames_per_eg_subsampled, + cut_zero_frames / frame_subsampling_factor, + range_starts_subsampled, + &deriv_weights); + else + chain::GetWeightsForRanges(frames_per_eg_subsampled, + range_starts_subsampled, + &deriv_weights); if (range_starts_subsampled.empty()) { KALDI_WARN << "No output for utterance " << utt_id @@ -177,35 +208,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return true; } -void RoundUpNumFrames(int32 frame_subsampling_factor, - int32 *num_frames, - int32 *num_frames_overlap) { - if (*num_frames % frame_subsampling_factor != 0) { - int32 new_num_frames = frame_subsampling_factor * - (*num_frames / frame_subsampling_factor + 1); - KALDI_LOG << "Rounding up --num-frames=" << (*num_frames) - << " to a multiple of --frame-subsampling-factor=" - << frame_subsampling_factor - << ", now --num-frames=" << new_num_frames; - *num_frames = new_num_frames; - } - if (*num_frames_overlap % frame_subsampling_factor != 0) { - int32 new_num_frames_overlap = frame_subsampling_factor * - (*num_frames_overlap / frame_subsampling_factor + 1); - KALDI_LOG << "Rounding up --num-frames-overlap=" << (*num_frames_overlap) - << " to a multiple of --frame-subsampling-factor=" - << frame_subsampling_factor - << ", now --num-frames-overlap=" << new_num_frames_overlap; - *num_frames_overlap = new_num_frames_overlap; - } - if (*num_frames_overlap < 0 || *num_frames_overlap >= *num_frames) { - KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap) << " < " - << "--num-frames=" << (*num_frames); - } - -} - - } // namespace nnet2 } // namespace kaldi @@ -237,6 +239,7 @@ int main(int argc, char *argv[]) { bool compress = true; int32 left_context = 0, right_context = 0, num_frames = 1, num_frames_overlap = 0, length_tolerance = 100, + cut_zero_frames = -1, frame_subsampling_factor = 1; std::string ivector_rspecifier; @@ -244,6 +247,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format (recommended)"); + po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames " + "(measured before subsampling) to zero the derivative on each " + "side of a cut point (if set, activates new-style derivative " + "weights)"); po.Register("left-context", &left_context, "Number of frames of left " "context the neural net requires."); po.Register("right-context", &right_context, "Number of 
frames of right " @@ -333,14 +340,15 @@ int main(int argc, char *argv[]) { || ivector_feats->NumRows() == 0)) { KALDI_WARN << "Length difference between feats " << feats.NumRows() << " and iVectors " << ivector_feats->NumRows() - << "exceeds tolerance " << length_tolerance; + << " exceeds tolerance " << length_tolerance; num_err++; continue; } if (ProcessFile(normalization_fst, feats, ivector_feats, supervision, - key, compress, left_context, right_context, num_frames, + key, compress, + left_context, right_context, num_frames, num_frames_overlap, frame_subsampling_factor, - &num_frames_written, &num_egs_written, + cut_zero_frames, &num_frames_written, &num_egs_written, &example_writer)) num_done++; else diff --git a/src/chainbin/nnet3-chain-train.cc b/src/chainbin/nnet3-chain-train.cc index 71092f1bc27..5486a5f7fe9 100644 --- a/src/chainbin/nnet3-chain-train.cc +++ b/src/chainbin/nnet3-chain-train.cc @@ -70,17 +70,21 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(nnet_rxfilename, &nnet); - fst::StdVectorFst den_fst; - ReadFstKaldi(den_fst_rxfilename, &den_fst); + bool ok; - NnetChainTrainer trainer(opts, den_fst, &nnet); + { + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); - SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainTrainer trainer(opts, den_fst, &nnet); - for (; !example_reader.Done(); example_reader.Next()) - trainer.Train(example_reader.Value()); + SequentialNnetChainExampleReader example_reader(examples_rspecifier); - bool ok = trainer.PrintTotalStats(); + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); diff --git a/src/configure b/src/configure index c90e9ba4ee0..0f6577dde17 100755 --- a/src/configure +++ b/src/configure @@ -52,7 +52,7 @@ function is_set { ## First do some checks. These verify that all the things are ## here that should be here. -if [ "`basename $PWD`" != "src" ]; then +if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' exit 1 fi @@ -177,7 +177,10 @@ do esac done - +# the idea here is that if you change the configuration options from using +# CUDA to not using it, or vice versa, we want to recompile all parts of the +# code that may use a GPU. Touching this file is a way to force this. +touch cudamatrix/cu-common.h 2>/dev/null function failure { echo "***configure failed: $* ***" >&2 @@ -400,11 +403,11 @@ function linux_configure_mkl_threading { } ## -##CUDA is used in src/cudamatrix and src/nnet{,bin} only. -##It is used to accelerate the neural network training, -##the rest of kaldi is running on CPUs. +## CUDA is used only in selected directories including src/cudamatrix, src/nnet* +## and src/chain*. It is used to accelerate the neural network training, the +## rest of kaldi runs on CPUs. ## -function linux_configure_cuda { +function configure_cuda { #check for CUDA toolkit in the system if [ ! 
$CUDATKDIR ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do @@ -425,9 +428,13 @@ function linux_configure_cuda { echo CUDATKDIR = $CUDATKDIR >> kaldi.mk if [ "`uname -m`" == "x86_64" ]; then - cat makefiles/linux_x86_64_cuda.mk >> kaldi.mk + if [ "`uname`" == "Darwin" ]; then + sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk + else + cat makefiles/cuda_64bit.mk >> kaldi.mk + fi else - cat makefiles/linux_cuda.mk >> kaldi.mk + cat makefiles/cuda_32bit.mk >> kaldi.mk fi else echo "CUDA will not be used! If you have already installed cuda drivers " @@ -538,7 +545,7 @@ function linux_configure_debian_ubuntu { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -557,7 +564,7 @@ function linux_configure_debian_ubuntu3 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -579,7 +586,7 @@ function linux_configure_debian7 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -598,7 +605,7 @@ function linux_configure_redhat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -619,7 +626,7 @@ function linux_configure_redhat_fat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -671,7 +678,7 @@ function linux_configure_static { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -750,7 +757,7 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -793,7 +800,7 @@ echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo "FSTROOT = $FSTROOT" >> kaldi.mk # Check installed OpenFst version and add C++11 flags if OpenFst >= 1.4 -OPENFST_VER=`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'` +OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk OPENFST_VER_NUM=`echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d"` if [ $OPENFST_VER_NUM -ge 10400 ]; then @@ -810,7 +817,7 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. 
Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." @@ -970,7 +977,7 @@ if [ "`uname`" == "Linux" ]; then fix_cxx_flag echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with MKL libs from $MKLROOT" exit_success; @@ -993,7 +1000,7 @@ if [ "`uname`" == "Linux" ]; then cat makefiles/linux_clapack.mk >> kaldi.mk fix_cxx_flag echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" exit_success; @@ -1017,7 +1024,7 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk cat makefiles/linux_openblas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." exit_success; diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 8718c49eea5..2b23bf0b621 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -51,19 +51,20 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, dim3 *dimBlock) { KALDI_ASSERT(num_rows > 0 && num_cols > 0); int32 col_blocksize = 64, row_blocksize = 4; - while (num_cols + (num_cols / 2) <= col_blocksize && - num_rows > 65536 * row_blocksize) { + while (col_blocksize > 1 && + (num_cols + (num_cols / 2) <= col_blocksize || + num_rows > 65536 * row_blocksize)) { col_blocksize /= 2; row_blocksize *= 2; } - KALDI_ASSERT(col_blocksize > 0 && "Matrix too large to process"); - dimBlock->x = col_blocksize; dimBlock->y = row_blocksize; dimBlock->z = 1; dimGrid->x = n_blocks(num_cols, col_blocksize); dimGrid->y = n_blocks(num_rows, row_blocksize); + KALDI_ASSERT(dimGrid->y <= 65536 && + "Matrix has too many rows to process"); dimGrid->z = 1; } #endif diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index ec7e69edad0..c34994ed6ce 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -435,7 +435,7 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // WARNING! the CUDA API is inconsistent accross versions! 
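The cu-common.cc hunk above changes the block-size heuristic so that it also halves the column block (and doubles the row block) whenever the row grid would otherwise exceed the 65536-block limit, instead of asserting. A standalone restatement of that heuristic (illustrative; the real function also fills in dimGrid/dimBlock and asserts the final row-grid size):

void BlockSizeSketch(int num_rows, int num_cols,
                     int *col_blocksize_out, int *row_blocksize_out) {
  int col_blocksize = 64, row_blocksize = 4;
  while (col_blocksize > 1 &&
         (num_cols + (num_cols / 2) <= col_blocksize ||
          num_rows > 65536 * row_blocksize)) {
    col_blocksize /= 2;     // fewer threads per row...
    row_blocksize *= 2;     // ...more rows per block, keeping the row grid <= 65536
  }
  *col_blocksize_out = col_blocksize;
  *row_blocksize_out = row_blocksize;
}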
#ifdef _MSC_VER size_t mem_free, mem_total; - cuMemGetInfo_v2(handle_, &mem_free, &mem_total); + cuMemGetInfo_v2(&mem_free, &mem_total); #else #if (CUDA_VERSION >= 3020) // define the function signature type @@ -447,9 +447,6 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // we will load cuMemGetInfo_v2 dynamically from libcuda.so // pre-fill ``safe'' values that will not cause problems mem_free = 1; mem_total = 1; -#ifdef _MSC_VER - cuMemGetInfo_v2(handle_, &mem_free, &mem_total); -#else // open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); if (NULL == libcuda) { @@ -473,7 +470,6 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // close the library dlclose(libcuda); } -#endif } #endif // copy the output values outside @@ -574,6 +570,7 @@ CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true), CuDevice::~CuDevice() { if (Enabled()) { cublasDestroy(handle_); + cudaDeviceReset(); } } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 804bea1a217..a52c42cf347 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -2,7 +2,7 @@ // Copyright 2009-2012 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu +// 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen @@ -44,7 +44,7 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr */ /* - * CuMatrix + * CuMatrix */ void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); @@ -58,7 +58,7 @@ void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim d); -void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -108,9 +108,9 @@ void cudaF_vec_min(const float* v, float* value, int dim); void cudaF_vec_max(const float* v, float* value, int dim); void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); -void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, - int N_col_stride, int threads_per_element, float beta); +void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, + int N_col_stride, int threads_per_element, float beta); void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim); void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_copy_col_from_mat_fd(int 
Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim); @@ -141,6 +141,7 @@ void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, i void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size); void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); +void cudaF_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); @@ -161,15 +162,15 @@ void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements); void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data); void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t); -void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim); void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indices); + const Int32Pair *indices); void cudaF_add_row_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indexes); + const Int32Pair *indexes); void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, const Int32Pair *indices, int indices_size, float *output); @@ -177,19 +178,19 @@ void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - + /********************************************************* * double CUDA kernel calls */ /* - * CuMatrix + * CuMatrix */ void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB); void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA); void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *vec, const double *mat2, int mat2_row_stride, - int mat2_col_stride, double beta); + int mat2_col_stride, double beta); void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat); void cudaD_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); @@ -197,7 +198,7 @@ void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool 
include_sign, MatrixDim d); -void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); +void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -248,9 +249,9 @@ void cudaD_vec_min(const double* v, double* value, int dim); void cudaD_vec_max(const double* v, double* value, int dim); void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); -void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, - int N_col_stride, int threads_per_element, double beta); +void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, + int N_col_stride, int threads_per_element, double beta); void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim); void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim); @@ -271,7 +272,7 @@ void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const d void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks, const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride, const double *D_data, int D_row_stride, int D_col_stride, - double alpha, double beta); + double alpha, double beta); /* @@ -283,6 +284,7 @@ void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); void cudaD_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size); void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); +void cudaD_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); @@ -342,14 +344,14 @@ void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim, const Int32Pair *indices, int indices_size, double *output); - + void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, - const double *mat2, double *mask, MatrixDim mat1_dim, + const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - - -} // extern "C" + + +} // extern "C" #endif // HAVE_CUDA diff --git 
a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 00af3eb234a..d494be4169a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -931,15 +931,15 @@ static void _add_diag_mat_mat( int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells // us which block of elements we sum up. - if (v_idx >= v_dim) return; - - Real sum = 0.0; - for (int j = sub_idx; j < M_cols; j += threads_per_element) { - int M_index = v_idx * M_row_stride + j * M_col_stride, - N_index = j * N_row_stride + v_idx * N_col_stride; - sum += M[M_index] * N[N_index]; + if (v_idx < v_dim) { + Real sum = 0.0; + for (int j = sub_idx; j < M_cols; j += threads_per_element) { + int M_index = v_idx * M_row_stride + j * M_col_stride, + N_index = j * N_row_stride + v_idx * N_col_stride; + sum += M[M_index] * N[N_index]; + } + temp_data[threadIdx.x] = sum; } - temp_data[threadIdx.x] = sum; // start_idx = threadIdx.x - sub_idx; // start of the position in temp_data // that we want to sum up. @@ -959,7 +959,7 @@ static void _add_diag_mat_mat( __syncthreads(); num_total_threads = half_point; } - if (sub_idx == 0) { + if (sub_idx == 0 && v_idx < v_dim) { v[v_idx] = beta * v[v_idx] + alpha * temp_data[threadIdx.x]; } } @@ -1152,7 +1152,6 @@ __global__ static void _pvec_sum(Real* v, Real* g, int dim, int size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int start = size * i; - if (start >= dim) return; int end = start + size; if (end > dim) end = dim; __shared__ Real row_data[CU1DBLOCK]; @@ -1752,6 +1751,19 @@ static void _diff_tanh(Real*eout, const Real*e, const Real*y, MatrixDim d, int e eout[dst_index] = (1.0 - y[y_index]*y[y_index]) * e[e_index]; } +template +__global__ +static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j*d.stride, src_index = i + j*src_stride; + if(i < d.cols && j < d.rows) { + Real res = (x[src_index] > 0.0 ? 
1.0 : 0.0); + y[dst_index] = res; + } +} + + template __global__ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { @@ -2145,7 +2157,6 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_heaviside<<>>(mat, d); - } void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -2471,6 +2482,10 @@ void cudaF_diff_tanh (dim3 Gr, dim3 Bl, float* eout, const float* e, const float _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaF_heaviside (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaF_softmax_reduce (size_t Gr, size_t Bl, float* y, const float* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } @@ -2930,6 +2945,10 @@ void cudaD_diff_tanh (dim3 Gr, dim3 Bl, double* eout, const double* e, const dou _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaD_heaviside (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaD_softmax_reduce (size_t Gr, size_t Bl, double* y, const double* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fc1fbae54da..0ded2f794d3 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -4,7 +4,7 @@ // 2013 Ehsan Variani // 2014 Johns Hopkins University (author: Daniel Povey) // 2013 Hainan Xu -// 2013 Xiaohui Zhang +// 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors @@ -33,14 +33,14 @@ #include "cudamatrix/cu-kernels-ansi.h" /* - * In this file are C++ templated wrappers + * In this file are C++ templated wrappers * of the ANSI-C CUDA kernels */ namespace kaldi { /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_upp_low(Gr, Bl, A, dimA); } @@ -176,10 +176,10 @@ inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); } inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim) { cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim, const float *mat2, int mat2_row_stride, int mat2_col_stride, const float *vec, float beta) { cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride, mat2_col_stride, vec, beta); } -inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } +inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } + - /* * CuVector */ @@ -194,8 +194,8 @@ inline void 
cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min( inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); } inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); } inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, +inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, int N_col_stride, int threads_per_element, float beta) { cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -240,6 +240,7 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_heaviside(Gr,Bl,y,x,d,src_stride); } /* Bl: dimBlock value is fixed min(d.col, CU1DBLOCK), represent CU1DBLOCK threads reduce a row at the same time. 
Gr: the number of rows @@ -283,7 +284,7 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } @@ -293,7 +294,7 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const f // double versions /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); } inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); } @@ -378,8 +379,8 @@ inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_mi inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); } inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); } inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, +inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, int N_col_stride, int threads_per_element, double beta) { cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -422,6 +423,7 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_heaviside(Gr,Bl,y,x,d,src_stride); } inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); } inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_log_softmax_reduce(Gr,Bl,y,x,d,src_stride); } @@ -460,7 +462,7 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, 
int mask_stride) { cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 453cf4439fb..65a4c0c4af3 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -1,7 +1,7 @@ // cudamatrix/cu-math.h // Copyright 2009-2012 Karel Vesely -// 2013 Johns Hopkins University (Author: David Snyder) +// 2013 Johns Hopkins University (Author: David Snyder) // See ../../COPYING for clarification regarding multiple authors // @@ -28,9 +28,9 @@ #include "base/timer.h" namespace kaldi { - + namespace cu { - + /// RegularizeL1 is a gradient step with l1 regularization added to the /// gradient. We don't let the value cross over zero from positive to negative /// or vice versa, in a single step. If an element tries to cross zero and is @@ -40,9 +40,9 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *gradient, Real l1_penalty, Real learning_rate); /// Copies a permutation of src into tgt. The row permutation is specified in -/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The +/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The /// dimensions of copy_from_idx must be equivalent to the number of rows in -/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. +/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, @@ -52,10 +52,10 @@ void Randomize(const CuMatrixBase &src, /// The dimensions of tgt must be equivalent to the number of rows in src /// and it must be that tgt.NumColumns == src.NumColumns * frame_offsets.Dim(). /// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the -/// general case where i in [0..src.NumRows()-1], -/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] +/// general case where i in [0..src.NumRows()-1], +/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] /// and n_cols = src.NumColumns(). If i + frame_offsets[k] is greater than the -/// number of rows in src or less than 0 than the right side of the equation +/// number of rows in src or less than 0 than the right side of the equation /// is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid /// an index out of bounds. template @@ -73,6 +73,13 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, CuMatrixBase *tgt); +template +void Group2norm(const CuMatrixBase &src, + CuMatrixBase *dest, + int32 group_stride); + + + } // namespace cu } // namespace kaldi diff --git a/src/cudamatrix/cu-matrix-inl.h b/src/cudamatrix/cu-matrix-inl.h index b4b51cbc53b..aa6fcf6f44d 100644 --- a/src/cudamatrix/cu-matrix-inl.h +++ b/src/cudamatrix/cu-matrix-inl.h @@ -54,8 +54,13 @@ inline CuSubMatrix::CuSubMatrix(const Real *data, // in general if you use SubMatrix or CuSubMatrix, const-correctness is not // preserved (preserving it would require us duplicating the class and it // would have been a hassle). + + // Note: we used to check that stride >= num_cols. We no longer check for + // this as there are some situations where having stride < num_cols is useful, + // but beware because most if not all CUBLAS calls will crash when given + // such an input, even in a situation where it makes sense. 
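The relaxed CuSubMatrix check above permits views whose stride is smaller than num_cols. In plain row-major terms, element (r, c) of such a view lives at data[r * stride + c], so with stride < num_cols successive rows overlap (each row begins before the previous one ends); as the comment notes, most CUBLAS calls will not accept such a view, so it is only safe for kernels that index elements directly. A one-line indexing sketch (illustrative only):

inline float ViewElement(const float *data, int r, int c, int stride) {
  return data[r * stride + c];   // rows overlap when stride < num_cols
}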
KALDI_ASSERT((num_rows != 0) == (num_cols != 0) && stride >= 0 && - num_rows >= 0 && num_cols >= 0 && num_cols <= stride); + num_rows >= 0 && num_cols >= 0 && stride >= 0); } diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index f50ded8c209..1c32de34d5c 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -298,6 +298,23 @@ template void TestCuMatrixSigmoid(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixHeaviside(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + N.ApplyHeaviside(); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::Heaviside" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMulRowsGroupMat(int32 dim) { BaseFloat time_in_secs = 0.025; @@ -806,6 +823,8 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCholesky(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSigmoid(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixHeaviside(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuFindRowMaxId(sizes[s]); for (int32 s = 0; s < ns; s++) diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 2f675faad99..74419ea25ba 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -754,6 +754,25 @@ static void UnitTestCuMatrixApplyHeaviside() { } +template +static void UnitTestCuMatrixHeaviside() { + + for (int32 i = 0; i < 1; i++) { + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); + H.SetRandn(); + H.Row(0).Set(0.0); + if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } + + CuMatrix cH(H); + CuMatrix cH2(H.NumRows(), H.NumCols(), kUndefined); + cH2.Heaviside(cH); + H.ApplyHeaviside(); + Matrix H2(cH2); + AssertEqual(H, H2); + } +} + + template static void UnitTestCuMatrixMulElements() { for (int32 i = 0; i < 2; i++) { @@ -2445,6 +2464,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyFloor(); UnitTestCuMatrixApplyCeiling(); UnitTestCuMatrixApplyHeaviside(); + UnitTestCuMatrixHeaviside(); UnitTestCuMatrixMulElements(); UnitTestCuMatrixDivElements(); UnitTestCuMatrixMax(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index eb5a268d543..7e8780902a6 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -46,7 +46,8 @@ namespace kaldi { template void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type) { + MatrixResizeType resize_type, + MatrixStrideType stride_type) { // This code does not currently support the other resize_type options. 
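  // Illustration of the two stride types handled below (a sketch, using the
  // usual row-major addressing data_[r * stride_ + c]):
  //   - kDefaultStride: rows are allocated via MallocPitch(), so each row may
  //     be padded for alignment and stride_ >= num_cols_ (e.g. a 10 x 100
  //     float matrix might end up with stride_ == 128).
  //   - kStrideEqualNumCols: one contiguous allocation of rows * cols
  //     elements, so stride_ == num_cols_ and the whole matrix can be treated
  //     as a flat array of rows * cols elements.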
KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0); @@ -54,7 +55,6 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, if (resize_type == kSetZero) this->SetZero(); return; } - if (this->num_rows_ != 0) this->Destroy(); if (rows == 0) return; @@ -63,11 +63,19 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, Timer tim; MatrixIndexT row_bytes = cols * sizeof(Real); size_t pitch; - this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( - row_bytes, rows, &pitch)); - this->num_rows_ = rows; - this->num_cols_ = cols; - this->stride_ = pitch / sizeof(Real); + if (stride_type == kDefaultStride) { + this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( + row_bytes, rows, &pitch)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = pitch / sizeof(Real); + } else { // kStrideEqualNumCols + size_t bytes = rows * cols * sizeof(Real); + this->data_ = static_cast(CuDevice::Instantiate().Malloc(bytes)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = cols; + } if (resize_type == kSetZero) this->SetZero(); CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); } else @@ -75,7 +83,7 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, { // Let the initializer of Matrix handle the allocation, // and then just do Swap which will switch the pointers. // This wastes a few instructions but is simple to code. - Matrix mat(rows, cols, resize_type); + Matrix mat(rows, cols, resize_type, stride_type); this->Swap(&mat); } } @@ -1895,6 +1903,7 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); + CU_SAFE_CALL(cudaGetLastError()); } else { KALDI_ERR << "Wrong sized arguments"; } @@ -2016,6 +2025,26 @@ void CuMatrixBase::ApplyHeaviside() { } } +template +void CuMatrixBase::Heaviside(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else + #endif + { + Mat().Heaviside(src.Mat()); + } +} template void CuMatrixBase::ApplyExp() { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index fd4c642ab7f..fec26424ef8 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -254,6 +254,11 @@ class CuMatrixBase { /// element by element, x = 1 / (1 + exp(-x)) void Sigmoid(const CuMatrixBase &src); + /// Set each element to the Heaviside function of the corresponding element + /// of "src", which we define as the function (x > 0 ? 1.0 : 0.0) [note: + /// in general, there are different ways to deal with the situation when x==0.] + void Heaviside(const CuMatrixBase &src); + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -336,7 +341,9 @@ class CuMatrixBase { ///< The output will be set zero. If include_sign is true, it will ///< multiply the result by the sign of the input. 
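  // Example usage (a sketch based on the unit test above, not part of the
  // original header): ApplyHeaviside() works in place, while Heaviside()
  // writes into a separate destination and leaves its source untouched:
  //
  //   CuMatrix<BaseFloat> M(100, 100), N(100, 100, kUndefined);
  //   M.SetRandn();
  //   N.Heaviside(M);      // N(i, j) = (M(i, j) > 0 ? 1.0 : 0.0)
  //   M.ApplyHeaviside();  // now M should equal N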
void ApplyPowAbs(Real power, bool include_sign=false); - void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0) + /// For each element, sets x = (x > 0 ? 1.0 : 0.0). + /// See also Heaviside(). + void ApplyHeaviside(); void ApplyFloor(Real floor_val); void ApplyCeiling(Real ceiling_val); void ApplyExp(); @@ -425,9 +432,9 @@ class CuMatrixBase { /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) void AddMatMatElements(const Real alpha, - const CuMatrixBase& A, - const CuMatrixBase& B, - const Real beta); + const CuMatrixBase& A, + const CuMatrixBase& B, + const Real beta); /// this <-- beta*this + alpha*A*B void AddMatSp(const Real alpha, @@ -619,8 +626,9 @@ class CuMatrix: public CuMatrixBase { /// Constructor with memory initialisation CuMatrix(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero) { - Resize(rows, cols, resize_type); + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride) { + Resize(rows, cols, resize_type, stride_type); } // Note: we had to remove the "explicit" keyword due @@ -679,7 +687,8 @@ class CuMatrix: public CuMatrixBase { /// Allocate the memory void Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero); + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride); void Swap(Matrix *mat); void Swap(CuMatrix *mat); @@ -782,8 +791,8 @@ template template Matrix::Matrix(const CuMatrixBase &M, MatrixTransposeType trans) { - if (trans == kNoTrans) Init(M.NumRows(), M.NumCols()); - else Init(M.NumCols(), M.NumRows()); + if (trans == kNoTrans) Init(M.NumRows(), M.NumCols(), kDefaultStride); + else Init(M.NumCols(), M.NumRows(), kDefaultStride); M.CopyToMat(this, trans); } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index a32e136f62e..9b7aa97776a 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -22,7 +22,7 @@ #include #include #include - +#include #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudamatrix/cu-matrix.h" @@ -62,7 +62,7 @@ static void UnitTestCuVectorIO() { } -template +template static void UnitTestCuVectorCopyFromVec() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -80,7 +80,7 @@ static void UnitTestCuVectorCopyFromVec() { } } -template +template static void UnitTestCuSubVector() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -97,7 +97,7 @@ static void UnitTestCuSubVector() { -template +template static void UnitTestCuVectorMulTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -105,7 +105,7 @@ static void UnitTestCuVectorMulTp() { A.SetRandn(); TpMatrix B(dim); B.SetRandn(); - + CuVector C(A); CuTpMatrix D(B); @@ -127,10 +127,10 @@ static void UnitTestCuVectorAddTp() { B.SetRandn(); Vector C(dim); C.SetRandn(); - + CuVector D(A); CuTpMatrix E(B); - CuVector F(C); + CuVector F(C); A.AddTpVec(1.0, B, kNoTrans, C, 1.0); D.AddTpVec(1.0, E, kNoTrans, F, 1.0); @@ -160,7 +160,7 @@ template void CuVectorUnitTestAddVec() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -177,7 +177,7 @@ template void CuVectorUnitTestAddVecCross() { CuVector vec1_orig(vec1); Real alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) 
AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } else { @@ -198,7 +198,7 @@ template void CuVectorUnitTestAddVecExtra() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243, beta = 1.4321; vec1.AddVec(alpha, vec2, beta); - + for (int32 i = 0; i < M; i++) AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -268,6 +268,20 @@ template static void UnitTestCuVectorReplaceValue() { } } +template static void UnitTestCuVectorSum() { + for (int32 i = 0; i < 200; i++) { + int32 start_dim = RandInt(1, 500), end_dim = RandInt(1, 500); + int32 dim = RandInt(10, 12000) + start_dim + end_dim; + Real quiet_nan = nan(""); // this is from . + Vector vec(start_dim + dim + end_dim); + vec.Range(0, start_dim).Set(quiet_nan); + vec.Range(start_dim, dim).Set(1.0); + vec.Range(start_dim + dim, end_dim).Set(quiet_nan); + BaseFloat sum = vec.Range(start_dim, dim).Sum(); + KALDI_ASSERT(ApproxEqual(sum, dim)); + } +} + template void CuVectorUnitTestInvertElements() { // Also tests MulElements(); int32 M = 256 + Rand() % 100; @@ -288,7 +302,7 @@ template void CuVectorUnitTestSum() { CuVector A(dim), ones(dim); A.SetRandn(); ones.Set(1.0); - + AssertEqual(VecVec(A, ones), A.Sum()); } } @@ -320,7 +334,7 @@ template void CuVectorUnitTestCopyFromMat() { } Matrix matrix(cu_matrix), matrix2(M, N); CuMatrix matrix3(M, N); - + CuVector vector(M * N), vector2(M * N); vector.CopyRowsFromMat(cu_matrix); vector2.CopyRowsFromMat(matrix); @@ -328,8 +342,8 @@ template void CuVectorUnitTestCopyFromMat() { matrix3.CopyRowsFromVec(Vector(vector2)); Vector vector3(M * N); vector3.CopyRowsFromMat(cu_matrix); - - + + for(int32 j = 0; j < M*N; j++) { if (Rand() % 500 == 0) { // random small subset (it was slow) KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N)); @@ -412,7 +426,7 @@ template void CuVectorUnitTestNorm() { KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0)); KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0))); } - + template void CuVectorUnitTestMin() { for (int32 p = 0; p < 5; p++) { @@ -496,7 +510,7 @@ template void CuVectorUnitTestApplyFloor() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyFloor(floor); int32 j = vector.ApplyFloor(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -517,7 +531,7 @@ template void CuVectorUnitTestApplyCeiling() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyCeiling(floor); int32 j = vector.ApplyCeiling(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -540,7 +554,7 @@ template void CuVectorUnitTestApplyPow() { BaseFloat pow = -2 + (Rand() % 5); cu_vector.ApplyPow(pow); vector.ApplyPow(pow); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -579,7 +593,7 @@ template void CuVectorUnitTestAddDiagMat2() { cu_mat_orig.SetRandn(); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); CuMatrix cu_mat(cu_mat_orig, trans); - + Vector vector(cu_vector); Matrix mat(cu_mat); @@ -604,12 +618,12 @@ static void CuVectorUnitTestAddDiagMatMat() { MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans); MatrixTransposeType transN = ((iter/2) % 2 == 0 ? 
kNoTrans : kTrans); CuMatrix M(M_orig, transM), N(N_orig, transN); - + v.SetRandn(); CuVector w(v); w.AddDiagMatMat(alpha, M, transM, N, transN, beta); - + { CuVector w2(v); CuMatrix MN(dimM, dimM); @@ -669,7 +683,7 @@ template void CuVectorUnitTestAddSpVec() { CuSpMatrix mat_cu(M); mat_cu.SetRandn(); SpMatrix mat(mat_cu); - + BaseFloat alpha = 0.5 * (Rand() % 5), beta = 0.5 * (Rand() % 5); dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta); dst.AddSpVec(alpha, mat, src, beta); @@ -695,6 +709,7 @@ template void CuVectorUnitTest() { CuVectorUnitTestScale(); CuVectorUnitTestSum(); CuVectorUnitTestInvertElements(); + UnitTestCuVectorSum(); CuVectorUnitTestAddRowSumMat(); CuVectorUnitTestAddColSumMat(); UnitTestCuVectorReplaceValue(); @@ -708,8 +723,8 @@ template void CuVectorUnitTest() { CuVectorUnitTestCopyDiagFromPacked(); CuVectorUnitTestCopyDiagFromMat(); CuVectorUnitTestCopyCross(); - CuVectorUnitTestCopyCross2(); - CuVectorUnitTestNorm(); + CuVectorUnitTestCopyCross2(); + CuVectorUnitTestNorm(); CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); @@ -732,10 +747,10 @@ int main(int argc, char *argv[]) { const char *usage = "Usage: cu-vector-test [options]"; ParseOptions po(usage); - std::string use_gpu = "yes"; + std::string use_gpu = "yes"; po.Register("use-gpu", &use_gpu, "yes|no|optional"); po.Read(argc, argv); - + if (po.NumArgs() != 0) { po.PrintUsage(); exit(1); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 64f41720869..6deb3809d85 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -279,7 +279,6 @@ Real CuVectorBase::Sum() const { CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); return tmp.Sum(); } else { - if (dim_ == 0) return 0.0; CuVector tmp(1, kUndefined); int dimBlock(CU1DBLOCK); int dimGrid = 1; // only 1 block here. we have loops in each thread. diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index 158248cc445..3aeef0bf24a 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -54,7 +54,7 @@ struct LatticeFasterDecoderConfig { // LatticeFasterDecoder class itself, but by the code that calls it, for // example in the function DecodeUtteranceLatticeFaster. fst::DeterminizeLatticePhonePrunedOptions det_opts; - + LatticeFasterDecoderConfig(): beam(16.0), max_active(std::numeric_limits::max()), min_active(200), @@ -99,7 +99,7 @@ class LatticeFasterDecoder { typedef Arc::Label Label; typedef Arc::StateId StateId; typedef Arc::Weight Weight; - + // instantiate this class once for each thing you have to decode. LatticeFasterDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -117,7 +117,7 @@ class LatticeFasterDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -365,8 +365,9 @@ class LatticeFasterDecoder { const fst::Fst &fst_; bool delete_fst_; std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... 
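  // A worked illustration of the cost_offsets_ idea (a sketch, not the exact
  // code path): if the acoustic log-likelihoods on some frame all sit around
  // -500, an offset of roughly +500 can be folded into that frame's costs so
  // that the numbers the decoder manipulates stay near zero; the stored
  // offset is undone again when the lattice is produced, so the final scores
  // are unaffected.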
bool warned_; @@ -409,7 +410,7 @@ class LatticeFasterDecoder { void ClearActiveTokens(); - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); + KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); }; diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index 30adb6df302..b69b5492fb7 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -62,7 +62,7 @@ class LatticeFasterOnlineDecoder { BestPathIterator(void *t, int32 f): tok(t), frame(f) { } bool Done() { return tok == NULL; } }; - + // instantiate this class once for each thing you have to decode. LatticeFasterOnlineDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -80,7 +80,7 @@ class LatticeFasterOnlineDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterOnlineDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -107,12 +107,12 @@ class LatticeFasterOnlineDecoder { bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - + /// This function does a self-test of GetBestPath(). Returns true on /// success; returns false and prints a warning on failure. bool TestGetBestPath(bool use_final_probs = true) const; - - + + /// This function returns an iterator that can be used to trace back /// the best path. If use_final_probs == true and at least one final state /// survived till the end, it will use the final-probs in working out the best @@ -133,7 +133,7 @@ class LatticeFasterOnlineDecoder { /// while leaving its "nextstate" variable unchanged. BestPathIterator TraceBackBestPath( BestPathIterator iter, LatticeArc *arc) const; - + /// Outputs an FST corresponding to the raw, state-level /// tracebacks. Returns true if result is nonempty. /// If "use_final_probs" is true AND we reached the final-state @@ -152,7 +152,7 @@ class LatticeFasterOnlineDecoder { bool use_final_probs, BaseFloat beam) const; - + /// InitDecoding initializes the decoding, and should only be used if you /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to /// call this. You can also call InitDecoding if you have already decoded an @@ -334,7 +334,7 @@ class LatticeFasterOnlineDecoder { /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); - + /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. BaseFloat ProcessEmitting(DecodableInterface *decodable); @@ -343,7 +343,7 @@ class LatticeFasterOnlineDecoder { /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). void ProcessNonemitting(BaseFloat cost_cutoff); - + // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of // them at a time can be indexed by StateId. It is indexed by frame-index @@ -361,9 +361,10 @@ class LatticeFasterOnlineDecoder { // make it class member to avoid internal new/delete. const fst::Fst &fst_; bool delete_fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. 
+ std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; diff --git a/src/decoder/lattice-tracking-decoder.h b/src/decoder/lattice-tracking-decoder.h index 91484b56c60..0737ca3db36 100644 --- a/src/decoder/lattice-tracking-decoder.h +++ b/src/decoder/lattice-tracking-decoder.h @@ -74,7 +74,7 @@ struct LatticeTrackingDecoderConfig { } void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 + KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && prune_interval > 0 && beam_delta > 0.0 && hash_ratio >= 1.0 && extra_beam >= 0.0 && max_beam >= beam); } @@ -135,7 +135,7 @@ class LatticeTrackingDecoder { /// format. bool Decode(DecodableInterface *decodable, const fst::StdVectorFst &arc_graph); - + /// says whether a final-state was active on the last frame. If it was not, the /// lattice (or traceback) will end with states that are not final-states. bool ReachedFinal() const { return final_active_; } @@ -167,7 +167,7 @@ class LatticeTrackingDecoder { /// final-probs as one. bool GetLattice(fst::MutableFst *ofst, bool use_final_probs = true) const; - + private: struct Token; // ForwardLinks are the links from a token to a token on the next frame. @@ -181,13 +181,13 @@ class LatticeTrackingDecoder { ForwardLink *next; // next in singly-linked list of forward links from a // token. inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, + BaseFloat graph_cost, BaseFloat acoustic_cost, ForwardLink *next): next_tok(next_tok), ilabel(ilabel), olabel(olabel), - graph_cost(graph_cost), acoustic_cost(acoustic_cost), + graph_cost(graph_cost), acoustic_cost(acoustic_cost), next(next) { } - }; - + }; + // Token is what's resident in a particular state at a particular time. // In this decoder a Token actually contains *forward* links. // When first created, a Token just has the (total) cost. We add forward @@ -200,19 +200,19 @@ class LatticeTrackingDecoder { // that any of the currently active states at the decoding front may // eventually succeed (e.g. if you were to take the currently active states // one by one and compute this difference, and then take the minimum). - + ForwardLink *links; // Head of singly linked list of ForwardLinks - + Token *next; // Next in list of tokens for this frame. - + StateId lat_state; // current state in graph arc lattice from first pass decoding // lat_state == fst::kNoStateId means that this token is not tracked - + inline Token(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLink *links, Token *next, StateId lat_state): tot_cost(tot_cost), extra_cost(extra_cost), links(links), next(next), lat_state(lat_state) { } inline void DeleteForwardLinks() { - ForwardLink *l = links, *m; + ForwardLink *l = links, *m; while (l != NULL) { m = l->next; delete l; @@ -221,7 +221,7 @@ class LatticeTrackingDecoder { links = NULL; } }; - + // head and tail of per-frame list of Tokens (list is in topological order), // and something saying whether we ever pruned it using PruneForwardLinks. 
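  // For illustration (a sketch, not part of this class): the Token and
  // ForwardLink structures above form two levels of singly linked lists and,
  // assuming the usual per-frame vector of TokenLists (active_toks_ in these
  // decoders), they are walked like this:
  //
  //   for (Token *tok = active_toks_[frame].toks; tok != NULL; tok = tok->next)
  //     for (ForwardLink *link = tok->links; link != NULL; link = link->next) {
  //       BaseFloat cost = tok->tot_cost + link->graph_cost + link->acoustic_cost;
  //       // ... prune or output the link ...
  //     }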
struct TokenList { @@ -231,7 +231,7 @@ class LatticeTrackingDecoder { TokenList(): toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) { } }; - + typedef HashList::Elem Elem; void PossiblyResizeHash(size_t num_toks); @@ -248,7 +248,7 @@ class LatticeTrackingDecoder { // lat_state is the next state in the arc graph lattice inline Token *FindOrAddToken(StateId state, StateId lat_state, int32 frame, BaseFloat tot_cost, bool *changed); - + // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens // all links, that have link_extra_cost > lattice_beam are pruned @@ -267,13 +267,13 @@ class LatticeTrackingDecoder { // on the final frame. If there are final tokens active, it uses // the final-probs for pruning, otherwise it treats all tokens as final. void PruneForwardLinksFinal(int32 frame); - + // Prune away any tokens on this frame that have no forward links. // [we don't do this in PruneForwardLinks because it would give us // a problem with dangling pointers]. // It's called by PruneActiveTokens if any forward links have been pruned void PruneTokensForFrame(int32 frame); - + // Go backwards through still-alive tokens, pruning them. note: cur_frame is // where hash toks_ are (so we do not want to mess with it because these tokens // don't yet have forward pointers), but we do all previous frames, unless we @@ -286,7 +286,7 @@ class LatticeTrackingDecoder { /// Version of PruneActiveTokens that we call on the final frame. /// Takes into account the final-prob of tokens. void PruneActiveTokensFinal(int32 cur_frame); - + /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); @@ -311,9 +311,10 @@ class LatticeTrackingDecoder { std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. const fst::Fst &fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeTrackingDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; @@ -331,9 +332,9 @@ class LatticeTrackingDecoder { // to the caller, who then has to call toks_.Delete(e) for each one. It was designed // this way for convenience in propagating tokens from one frame to the next. void ClearToks(Elem *list); - + void ClearActiveTokens(); - + }; diff --git a/src/doc/glossary.dox b/src/doc/glossary.dox index ba42ea12370..31fa62d3389 100644 --- a/src/doc/glossary.dox +++ b/src/doc/glossary.dox @@ -26,7 +26,7 @@ search function of your browser. For convenience the definition of each term's section is preceded and followed by a colon, so for instance, typing ctrl-f ":lattice:" would take you to the section for "lattice". - +

@@ -37,7 +37,7 @@ synonymous with a sequence of transition-ids. Most of the time an alignment is derived from aligning the reference transcript of an utterance, in which case it is called a forced alignment. lattices also contain alignment information as sequences of transition-ids for each word -sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows +sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows alignments in a human-readable format. :forced alignment: see alignment. @@ -54,6 +54,18 @@ of the HMMs, and also various other important integer mappings; see \ref transit This object is generally written at the start of model files. The program \ref bin/show-transitions.cc "show-transitions" shows these. +:G.fst: The grammar FST G.fst which lives in the + data/lang/ directory in the scripts (see \ref data_prep_lang) represents + the language model in a Finite State Transducer format (see www.openfst.org). + For the most part it is an acceptor, meaning the input and output symbols on the + arcs are the same, but for statistical language models with backoff, the backoff + arcs have the "disambiguation symbol" #0 on the input side only. + For many purposes you'll want to get rid of the disambiguation symbols + using the command fstproject --project_output=true. The disambiguation symbols + are needed during graph compilation to make the FST determinizable, but for things + like language-model rescoring you don't want them. + +
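 As a sketch (not taken from the Kaldi scripts), the same projection can be done
 from C++ with the OpenFst library, which is what the fstproject command wraps;
 the enum name below is the one used by the OpenFst versions current at the time:
\code
// Replace the input labels of G.fst by its output labels, so the #0 backoff
// disambiguation symbols disappear from the input side.
fst::VectorFst<fst::StdArc> *g =
    fst::VectorFst<fst::StdArc>::Read("data/lang/G.fst");  // NULL on failure.
fst::Project(g, fst::PROJECT_OUTPUT);
// ... use or write g, then:
delete g;
\endcode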
*/ diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/doc/install.dox b/src/doc/install.dox index 0ffb2b1220f..b40b139a8dc 100644 --- a/src/doc/install.dox +++ b/src/doc/install.dox @@ -29,8 +29,8 @@ possibly including unfinished and experimental features, can be downloaded by typing into a shell: \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden - cd kaldi-trunk + git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream + cd kaldi \endverbatim If you want to get updates and bug fixes you can go to some checked-out directory, and type diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox index ee2bc11d8b9..df9f96e8430 100644 --- a/src/doc/tree_externals.dox +++ b/src/doc/tree_externals.dox @@ -32,13 +32,13 @@ namespace kaldi { The basic algorithm that is being implemented is a top-down greedy splitting, where we have a number of ways we can split the data by asking about, say, the left phone, the right - phone, the central phone, the state we're in, and so on. + phone, the central phone, the state we're in, and so on. The algorithm we implement is similar to the standard algorithm, see for example the paper "Tree-based State Tying for High Accuracy Acoustic Modeling" by Young, Odell and Woodland. In this algorithm, we split the data up by asking the locally optimal question, i.e. the one that gives the most likelihood increase, supposing - we model the data on each side of the split by a single Gaussian. - Differences from standard implementations include added flexibility + we model the data on each side of the split by a single Gaussian. + Differences from standard implementations include added flexibility about how to configure the tree roots; the ability to ask questions about the HMM-state and the central phone; and the fact that by default in the Kaldi scripts, the questions are automatically generated by a top-down binary clustering of the data, which means @@ -50,7 +50,7 @@ namespace kaldi { be the tree roots. For how to configure it using the standard scripts, see \ref data_prep. In practice we generally let each tree-root correspond to a "real phone", meaning that we group together all word-position-dependent, tone-dependent or stress-dependent versions of - each phone into one group that becomes a tree root. + each phone into one group that becomes a tree root. The rest of this page mostly gives details at the code level of what is happening. 
@@ -74,7 +74,7 @@ below summarizes these values: N is the width of the context window and P is the identity of the designated -"central phone". Normally P is exactly in the middle of the window +"central phone". Normally P is exactly in the middle of the window (hence the name "central-position"); for example, with N=3, we would normally have P=1, but you are free to choose any value from 0 to N-1; for instance, P=2 and N=3 means two phones of left context and no right context at all. @@ -82,32 +82,32 @@ In the code, when we talk about the "central phone" we always mean the P'th phone which may or may not actually be the central phone of the context window. A vector of integers representing a typical triphone context window might be: -\code -// probably not valid C++ +\code +// probably not valid C++ vector ctx_window = { 12, 15, 21 }; \endcode -Assuming N=3 and P=1, this would represent phone 15 with +Assuming N=3 and P=1, this would represent phone 15 with a right context of 21 and a left context of 12. The way we handle end effects is using zero (which is not a valid phone because it's reserved in OpenFst for the epsilon meaning "no symbol"), so for instance: -\code +\code vector ctx_window = { 12, 15, 0 }; \endcode means phone 15 with a left-context of 12 and no right-context because it's the end of the utterance. At the end of utterance in particular, the use of zero this way may be a little unexpected because the last "phone" is actually the -subsequential symbol "$" (see \ref graph_c), but for the convenience +subsequential symbol "$" (see \ref graph_c), but for the convenience of the decision-tree code we don't put the subsequential symbol in these context windows, we put zero. Note that if we had N=3 and P=2, the above context window would be invalid because its P'th element would be zero which is not a real phone; also of course, -if we had a tree with N=1, neither of the windows above would be valid because they +if we had a tree with N=1, neither of the windows above would be valid because they are the wrong size. In the monophone case, we would have a window like: -\code +\code vector ctx_window = { 15 }; \endcode so monophone systems are just treated as a special case of context-dependent -systems, with a window size N of 1 and a tree that doesn't do anything very +systems, with a window size N of 1 and a tree that doesn't do anything very interesting. @@ -126,28 +126,28 @@ TransitionModel object and an AmDiagGmm object). If the program gmm-init-mono receives an option called --shared-phones, it will share the pdfs between specified sets of phones; otherwise it makes all the phones separate. -After training a monophone system starting from a flat start, we take +After training a monophone system starting from a flat start, we take the monophone alignments -and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc +and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc "acc-tree-stats") to accumulate statistics for training the tree. This program is not limited to reading in monophone alignments; it works from context-dependent alignments too so we can build trees based on e.g. triphone alignments. -The statistics for tree building are written to disk as the type \ref BuildTreeStatsType -(see \ref treei_stats). +The statistics for tree building are written to disk as the type \ref BuildTreeStatsType +(see \ref treei_stats). 
The function AccumulateTreeStats() takes the values N and P, as explained in the previous section; the command-line programs will set these by default to 3 and 1 respectively, but this can be overridden using the --context-width -and --central-position options. The program \ref acc-tree-stats.cc +and --central-position options. The program \ref acc-tree-stats.cc "acc-tree-stats" takes a list of context-independent phones (e.g. silence), but this is not required even if there are context-independent phones; it is just -a mechanism to reduce the size of the statistics. +a mechanism to reduce the size of the statistics. For context-independent hones, the program will accumulate the corresponding statistics without the keys corresponding to the left and right phones defined (c.f. \ref treei_event_map). When the statistics have been -accumulated we use the program \ref build-tree.cc "build-tree" to -build the tree. This outputs the tree. +accumulated we use the program \ref build-tree.cc "build-tree" to +build the tree. This outputs the tree. The program \ref build-tree.cc "build-tree" requires three things: - The statistics (of type BuildTreeStatsType) - The questions config (of type Questions) @@ -160,21 +160,32 @@ scripts, these are automatically obtained from tree-building statistics by the program cluster-phones. The roots file specifies sets of phones that are goint to have shared roots in the decision-tree clustering process, and says for each phone set the following two things: - - "shared" or "not-shared" says whether or not there should be separate - roots for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, - in the typical case), or if the roots - should be shared. If we are going to be splitting (the "split" option - below) we enforce that the roots should be shared. + + - "shared" or "not-shared" says whether or not there should be separate roots + for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, in the + typical case), or if the roots should be shared. If it says "shared" there + will be a single tree-root for all HMM states (e.g. all three states, in a + normal topology) ; if "not-shared" there would be (e.g.) three tree-roots, + one for each pdf-class. + - "split" or "not-split" says whether or not the decision tree splitting should actually be done for the roots in question (for silence, we - typically don't split). + typically don't split). If the line says "split" (the normal case) then + we do the decision tree splitting. If it says "not-split" then no splitting + is done and the roots are left un-split. -Be careful because the notation is a bit tricky. The "shared" on the line of -the roots file is about whether we will share all the 3 HMM-states of the phone -in a single tree root. But we will always share together the roots of all the phones that -appear on a single lines of the roots file. This is not configurable via these -strings because if you don't want to share them, you can just put them on -separate lines of the roots file. + +The following will clarify some aspects of how this works: + + - If we say "shared split", then + even though there is one root node for all three HMM-states, the different + HMM states can still get different leaves because the tree can ask questions + about the pdf-class as well as about phonetic context. + + - We always share together the roots of all the phones that appear on a single + lines of the roots file. 
This is not configurable via these strings because + if you don't want to share the phones' roots, you can just put them on + separate lines of the roots file. Below is an example of a roots file; this assumes that phone 1 is silence and all the other phones have separate roots. @@ -185,14 +196,14 @@ shared split 3 ... shared split 28 \endverbatim -Having multiple phones on the same line is most useful when we have things like position and +Having multiple phones on the same line is most useful when we have things like position and stress-dependent phones; in this case each "real" phone would correspond to a set of integer phone ids. In that case we share the roots for all versions of a particular underlying phone. Below is an example of a roots file -for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; +for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; it would have to be converted to integer form before being read by Kalid): \verbatim -not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S +not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S shared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_S shared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_S shared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_S @@ -207,7 +218,7 @@ When creating the roots file, you should ensure that at least one phone on each For instance, in this case, if the phone AY was seen in at least some combination of stress and word-position, we would be OK. -In this example, we have various word-position-dependent variants of silence and so on. +In this example, we have various word-position-dependent variants of silence and so on. In this example they will all share their pdf's because they are on the same line and are "not-split"-- but they may have different transition parameters. In fact, most of these variants of silence would never be used as silence never appears inside words; this is for @@ -224,13 +235,13 @@ tree to another using the program \ref convert-ali.cc "convert-ali". pdf-id, and these are contiguous (typically there are several thousand of these in an LVCSR system). They are originally assigned when the tree is first built. Depending how the tree is built, it may or may not be possible to say, for each pdf-id, which phone - it corresponds to. + it corresponds to. \section tree_ctxdep Context dependency objects The ContextDependencyInterface object is a virtual base-class for the tree that specifies how it interacts with the graph-building code. This - interface contains only four functions: + interface contains only four functions: - \ref ContextDependencyInterface::ContextWidth() "ContextWidth()" returns the value of N (context-width) that the tree requires. 
- \ref ContextDependencyInterface::CentralPosition() "CentralPosition()" returns @@ -264,8 +275,8 @@ else \endcode The only class that currently inherits from ContextDependencyInterface -is the class ContextDependency, which has marginally richer interface; -the only important addition is the function \ref ContextDependency::GetPdfInfo +is the class ContextDependency, which has marginally richer interface; +the only important addition is the function \ref ContextDependency::GetPdfInfo "GetPdfInfo" which is used by the TransitionModel class to work out which phones a particular pdf can possibly correspond to (this function could be emulated given only the interface of ContextDependencyInterface, by @@ -274,7 +285,7 @@ enumerating all contexts). The ContextDependency object is actually a fairly thin wrapper for the EventMap object; see \ref tree_internals. We wanted to hide the actual implementation of the tree as much as possible to make it -easy to refactor the code later if needed. +easy to refactor the code later if needed. \section tree_example An example of a decision tree @@ -309,18 +320,18 @@ Below is a kind of quasi-BNF notation that explains the tree-file format. In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that splits on key 1, which is the central phone. In square brackets are a contiguous range of phone-ids. As it happens, these don't represent a question, but are just a way of -splitting on phones so we can get to the "real" decision trees which are per phone. +splitting on phones so we can get to the "real" decision trees which are per phone. The issue is that this tree was built with "shared roots", so there are various phone-ids, corresponding to different word-position-and-stress-marked versions of the same phone, that share the root. We can't use a TableEventMap (TE) at the top level of the tree, or we'd have to repeat each decision tree several times (since the EventMap is a pure -tree, not a general graph, it has no mechanism for pointers to be "shared"). -The next few instances of the "SE" label are also part of this "quasi-tree" which +tree, not a general graph, it has no mechanism for pointers to be "shared"). +The next few instances of the "SE" label are also part of this "quasi-tree" which is initially splitting on the central phone (as we go down this file we are going deeper into the tree; notice that the braces "{" are opening but not yet closing). Then we have the string "TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap -on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. +on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. The values represent the five pdf-ids for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these three non-speech phones (only the transition matrix is specific to each non-speech phone). @@ -332,8 +343,8 @@ various versions of the phone AA; and question is asking whether the pdf-class ( has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the next question is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various forms of the phone "M" (a rather unintuitive question to ask, since we're -in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 
286 287 ]" which is -a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if +in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is +a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if no, 696 ("CE 696"). \verbatim s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100 @@ -366,8 +377,8 @@ SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 4 \endverbatim Below is a simpler example: the monophone tree from the Resource Management -recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). -The key "0" is the phone-position of zero which represents the central (and only) phone +recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). +The key "0" is the phone-position of zero which represents the central (and only) phone since the context width (N) is 1. The number of entries in the table is 49 (in this case, the number of phones plus one). The first EventMap in the table (index zero) is NULL, because there is no phone with @@ -375,11 +386,11 @@ index zero. The next one is a TableEventMap with three elements, corresponding to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )". \verbatim s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5 -ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) -TE -1 3 ( CE 3 CE 4 CE 5 ) -TE -1 3 ( CE 6 CE 7 CE 8 ) -TE -1 3 ( CE 9 CE 10 CE 11 ) -TE -1 3 ( CE 12 CE 13 CE 14 ) +ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) +TE -1 3 ( CE 3 CE 4 CE 5 ) +TE -1 3 ( CE 6 CE 7 CE 8 ) +TE -1 3 ( CE 9 CE 10 CE 11 ) +TE -1 3 ( CE 12 CE 13 CE 14 ) \endverbatim @@ -391,8 +402,8 @@ disambiguation symbols and possibly epsilon symbols). In the graph, as always, these are represented by integer labels. We use an object that, in code and in filenames, is generally called ilabel_info. The ilabel_info object 4has a strong connection to the \ref fst::ContextFst "ContextFst" objects, see \ref graph_context. -As with many other Kaldi types, ilabel_info is a generic (STL) type but -we use a consistent variable name +As with many other Kaldi types, ilabel_info is a generic (STL) type but +we use a consistent variable name to make it identifiable. It is of the following type: \code std::vector > ilabel_info; @@ -402,7 +413,7 @@ input label the corresponding phonetic context window (see above, \ref tree_window). For example, suppose symbol 1500 is phone 30 with a right-context of 12 and a left-context of 4, we would have -\code +\code // not valid C++ ilabel_info[1500] == { 4, 30, 12 }; \endcode @@ -410,14 +421,14 @@ In the monophone case, we would have things like: \code ilabel_info[30] == { 28 }; \endcode -There are special cases to deal with disambiguation symbols (see -\ref graph_disambig or the +There are special cases to deal with disambiguation symbols (see +\ref graph_disambig or the Springer Handbook paper referenced above for an explanation of what these are). 
If an ilabel_info entry corresponds to a disambiguation symbol, we put in it the negative of the symbol-table entry of the disambiguation symbol (note that this is not the same as the number of the printed form -of the disambiguation symbol as in #0, #1, #2 etc., it is the number -corresponding to it in a symbol-table file, which in our current scripts is +of the disambiguation symbol as in #0, #1, #2 etc., it is the number +corresponding to it in a symbol-table file, which in our current scripts is called phones_disambig.txt). For example, \code ilabel_info[5] == { -42 }; @@ -428,7 +439,7 @@ so the programs that interpret the ilabel_info object don't need to be given a list of disambiguation symbols in order to be able to distinguish them from real phones in the monophone case. There are two additional special cases: we have -\code +\code ilabel_info[0] == { }; // epsilon ilabel_info[1] == { 0 }; // disambig symbol #-1; // we use symbol 1, but don't consider this hardwired. diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc index af1f7b1a346..5a0fb2a48fa 100644 --- a/src/feat/feature-fbank.cc +++ b/src/feat/feature-fbank.cc @@ -109,7 +109,7 @@ void Fbank::ComputeInternal(const VectorBase &wave, // Get dimensions of output features int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); - int32 cols_out = opts_.mel_opts.num_bins + opts_.use_energy; + int32 cols_out = opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); if (rows_out == 0) { output->Resize(0, 0); if (wave_remainder != NULL) diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h index 2a3819c5f62..febfeac9f9b 100644 --- a/src/feat/feature-fbank.h +++ b/src/feat/feature-fbank.h @@ -80,7 +80,9 @@ class Fbank { explicit Fbank(const FbankOptions &opts); ~Fbank(); - int32 Dim() const { return opts_.mel_opts.num_bins; } + int32 Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } /// Will throw exception on failure (e.g. if file too short for even one /// frame). 
The output "wave_remainder" is the last frame or two of the diff --git a/src/feat/signal.cc b/src/feat/signal.cc index 19b876989c2..a374c531e3d 100644 --- a/src/feat/signal.cc +++ b/src/feat/signal.cc @@ -34,11 +34,11 @@ void ElementwiseProductOfFft(const Vector &a, Vector *b) { void ConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); - Vector signal_padded(signal_length + filter_length - 1); + Vector signal_padded(signal_length + filter_length - 1); signal_padded.SetZero(); for (int32 i = 0; i < signal_length; i++) { for (int32 j = 0; j < filter_length; j++) { - signal_padded(i + j) += (*signal)(i) * filter(j); + signal_padded(i+j) += (*signal)(i) * filter(j); } } signal->CopyFromVec(signal_padded.Range(0, signal_length)); @@ -54,11 +54,11 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector SplitRadixRealFft srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector signal_padded(fft_length); + Vector signal_padded(fft_length); signal_padded.Range(0, signal_length).CopyFromVec(*signal); srfft.Compute(signal_padded.Data(), true); @@ -70,7 +70,8 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector signal->CopyFromVec(signal_padded.Range(0, signal_length)); } -void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal, + bool apply_inverse) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); @@ -83,13 +84,37 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, Vector srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); + + // If true, inverse of filter is computed and + // input signal is convolved with inverse of filter. + // The inverse of filter H_inv(w) is estimated as + // conj(H(w))/( abs(H(w))^2 + const) + if (apply_inverse) { + BaseFloat abs_Hw, const_val = 0.0; + int32 half_N = filter_padded.Dim() / 2; + Vector inv_filter_padded(filter_padded); + inv_filter_padded(0) = + filter_padded(0) / (filter_padded(0) * filter_padded(0) + const_val); + inv_filter_padded(1) = + filter_padded(1) / (filter_padded(1) * filter_padded(1) + const_val); + for (int32 bin = 1; bin < half_N; bin++) { + int32 w_real_ind = 2 * bin, + w_im_ind = 2 * bin + 1; + abs_Hw = filter_padded(w_real_ind) * filter_padded(w_real_ind) + + filter_padded(w_im_ind) * filter_padded(w_im_ind); + + inv_filter_padded(w_real_ind) /= (abs_Hw + const_val); + inv_filter_padded(w_im_ind) *= -1.0 / (abs_Hw + const_val); + } + filter_padded.CopyFromVec(inv_filter_padded); + } - Vector temp_pad(filter_length - 1); + Vector temp_pad(filter_length - 1); temp_pad.SetZero(); - Vector signal_block_padded(fft_length); + Vector signal_block_padded(fft_length); for (int32 po = 0; po < signal_length; po += block_length) { // get a block of the signal diff --git a/src/feat/signal.h b/src/feat/signal.h index 7ff0ce33b52..b9a49473b96 100644 --- a/src/feat/signal.h +++ b/src/feat/signal.h @@ -44,7 +44,8 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector overlap-add method. This is an efficient way to evaluate the discrete convolution of a long signal with a finite impulse response filter. 
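   If "apply_inverse" is true (a sketch of the intent, based on the
   implementation in signal.cc): the filter's frequency response H(w) is
   approximately inverted as conj(H(w)) / (|H(w)|^2 + const), and the signal is
   convolved with that inverse instead, which roughly undoes the effect of the
   filter.  Usage sketch (illustrative sizes only):

     Vector<BaseFloat> signal(16000), filter(100);
     signal.SetRandn(); filter.SetRandn();
     FFTbasedBlockConvolveSignals(filter, &signal);        // signal <- conv(signal, filter)
     FFTbasedBlockConvolveSignals(filter, &signal, true);  // approximately undoes it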
*/ -void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal); +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal, + bool apply_inverse = false); } // namespace kaldi diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 9843e7bbd4b..bff2b212a5b 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -15,7 +15,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \ process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \ compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \ wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \ - concat-feats + concat-feats compute-filter apply-filter OBJFILES = diff --git a/src/featbin/apply-filter.cc b/src/featbin/apply-filter.cc new file mode 100644 index 00000000000..8ddfd0073c2 --- /dev/null +++ b/src/featbin/apply-filter.cc @@ -0,0 +1,114 @@ +// featbin/apply-filters.cc + +// Copyright 2016 Pegah Ghahremani + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" +#include "feat/signal.h" + +namespace kaldi { +void ApplyFilter(const Vector &input, + const Vector &filter, + Vector *filtered_input) { + int32 min_size = 0, + size = input.Dim(), f_order = filter.Dim(); + filtered_input->Resize(size); + // compute filtered input as y_j = sum_{i=1}^n x_(j-i) * a_i + // where input is y and filtered version is xi. + // sp x_j = 1/a_0 * (y_j - sum_{i=1}^p a_i * x(j-i)) + (*filtered_input)(0) = input(0); + for (int32 i = 0; i < size; i++) { + min_size = std::min(f_order, i); + BaseFloat sum = 0; + for (int32 j = 1; j < min_size; j++) + sum += filter(j) * (*filtered_input)(i-j); + KALDI_ASSERT(filter(0) != 0); + (*filtered_input)(i) = (input(i) - sum) / filter(0); + } +} + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "apply filter to wave files supplied via input pip as FIR or IIR filter.\n" + "If the --inverse=false, it applies filter as FIR filter\n" + "and if --inverse=true, the inverse of filter applies as IIR filter.\n" + "Usage: apply-filters [options...] " + " \n" + "e.g. 
\n" + "apply-filters --inverse=false --utt2spkfilter=ark:data/train/utt2spkfilter \n" + " input.wav filter.wav output_1.wav\n"; + ParseOptions po(usage); + + bool inverse = false; + std::string utt2spkfilter_rspecifier = ""; + po.Register("inverse", &inverse, + "If false, the filter is applied as FIR filter," + "otherwise its inverse applied as IIR filter."); + po.Register("utt2spkfilter", &utt2spkfilter_rspecifier, + "rspecifier for utterance to spkear-filter list map" + " used to filter each utterance"); + po.Read(argc, argv); + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + std::string input_wave_file = po.GetArg(2), + filter_file = po.GetArg(1), + output_wave_file = po.GetArg(3); + + WaveData input_wave; + { + Input ki(input_wave_file); + input_wave.Read(ki.Stream()); + + } + + SequentialBaseFloatVectorReader filter_reader(filter_file); + const Vector &lpc_filter = filter_reader.Value(); + + Vector filtered_wav(input_wave.Data().Row(0)); + BaseFloat samp_freq_input = input_wave.SampFreq(); + // If inverse = false, it does FFT-based block Convolution of filter with + // long input signal. + // Otherwise inverse of filter is convolved with input signal. + // If we use lp coefficients as [1 -a1 -a2 ... ap] as filter + // convolving input with this filter is like whitening transform. + // y'[n] = y[n] - sum_{i=1}^p {input_wav[n-i] * lpc_coeffs[i]} + // = conv(y, [1 :-lpc-coeffs]) + Vector orig_wav(filtered_wav); + //if (inverse) + // ApplyFilter(orig_wav, lpc_filter, &filtered_wav); + //else + FFTbasedBlockConvolveSignals(lpc_filter, &filtered_wav, inverse); + Matrix filtered_wav_mat(1, filtered_wav.Dim()); + filtered_wav_mat.CopyRowsFromVec(filtered_wav); + WaveData out_wave(samp_freq_input, filtered_wav_mat); + Output ko(output_wave_file, false); + out_wave.Write(ko.Stream()); + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/featbin/compute-filter.cc b/src/featbin/compute-filter.cc new file mode 100644 index 00000000000..4f750e418cf --- /dev/null +++ b/src/featbin/compute-filter.cc @@ -0,0 +1,300 @@ +// featbin/compute-filters.cc + +// Copyright 2016 Pegah Ghahremani + + +#include "feat/feature-functions.h" +#include "matrix/srfft.h" +#include "matrix/kaldi-matrix-inl.h" +#include "feat/wave-reader.h" +namespace kaldi { + +// struct used to store statistics required for +// computing correlation coefficients +struct CorrelationStats { + int32 corr_order; // number of correlation coefficient. R[0],..,R[corr_order - 1] + int32 num_samp; // number of samples + BaseFloat samp_sum; // sum of samples. + Vector l2_norms; // l2_norms[j] - inner product of shifted input by itself as + // sum_{i=0}^corr_window_size x[i+j]^2 + Vector inner_prod; // inner product of input vector with its shifted version by j + // sum_{i=0}^corr_window_size x[i] * x[i+j] + CorrelationStats(): corr_order(100), num_samp(0), samp_sum(0) { + l2_norms.Resize(corr_order); + inner_prod.Resize(corr_order);} + + CorrelationStats(int32 corr_order): corr_order(corr_order), + num_samp(0), samp_sum(0) { + l2_norms.Resize(corr_order); + inner_prod.Resize(corr_order); } +}; + +/* + This function computes and accumulates statistics + required for computing auto-correlation coefficient using waveform "wave", + e.g dot-product of input with its shifted version. 
+ inner_prod[j] - inner product of the input vector with its version shifted by j, + sum_{i=0}^corr_window_size x[i] * x[i+j] + l2_norms[j] - inner product of the shifted input with itself, + sum_{i=0}^corr_window_size x[i+j]^2 + lpc_order is the size of autocorr_coeffs. +*/ +void AccStatsForCorrelation(const VectorBase &wave, + int32 lpc_order, + CorrelationStats *acc_corr_stats) { + KALDI_ASSERT(acc_corr_stats->inner_prod.Dim() == lpc_order); + acc_corr_stats->samp_sum += wave.Sum(); + acc_corr_stats->num_samp += wave.Dim(); + int32 corr_window_size = wave.Dim() - lpc_order; + Vector norm_wave(wave); + SubVector sub_vec1(norm_wave, 0, corr_window_size); + BaseFloat local_l2_norm = VecVec(sub_vec1, sub_vec1), sum; + + acc_corr_stats->inner_prod(0) += local_l2_norm; + + for (int32 lag = 1; lag < lpc_order; lag++) { + SubVector sub_vec2(norm_wave, lag, corr_window_size); + int32 last_ind = corr_window_size + lag - 1; + local_l2_norm += (wave(last_ind) * wave(last_ind) - + wave(lag - 1) * wave(lag - 1)); + sum = VecVec(sub_vec1, sub_vec2); + acc_corr_stats->inner_prod(lag) += sum; + acc_corr_stats->l2_norms(lag) += local_l2_norm; + } +} +/* + Compute autocorrelation coefficients from the accumulated unnormalized statistics, + i.e. the inner products and l2 norms. + The inner products and l2_norms can be normalized using the mean E[x]: + autocorr[j] = sum_{i=0}^n (x[i] - E[x]) * (x[i+j] - E[x]) / + [(sum_{i=0}^n (x[i] - E[x])^2) * (sum_{i=0}^n (x[i+j] - E[x])^2)]^0.5 + autocorr[j] = inner_prod[j] / (norms[0] * norms[j])^0.5 + inner_prod[j] - inner product of the input vector with its version shifted by j, + sum_{i=0}^n x[i] * x[i+j] + l2_norms[j] - inner product of the shifted input with itself, sum_{i=0}^n x[i+j]^2 +*/ +void ComputeCorrelation(const CorrelationStats &acc_corr_stats, + Vector *autocorr) { + + KALDI_ASSERT(acc_corr_stats.inner_prod.Dim() == acc_corr_stats.l2_norms.Dim()); + + int32 lpc_order = acc_corr_stats.inner_prod.Dim(); + autocorr->Resize(lpc_order); + for (int32 lag = 0; lag < lpc_order; lag++) + (*autocorr)(lag) = acc_corr_stats.inner_prod(lag); + + // scale the autocorrelation so that autocorr(0) == 1 + autocorr->Scale(1.0 / (*autocorr)(0)); + +} +/* + Durbin's recursion - converts autocorrelation coefficients to LPC coefficients. + pTmp - temporary buffer [n] + pAC - autocorrelation coefficients [n + 1] + pLP - linear prediction coefficients [n] (predicted_s[n] = sum_{i=1}^P a[i] * s[n-i]) +*/ +double DurbinInternal(int32 n, double *pAC, double *pLP, double *pTmp) { + double ki; // reflection coefficient + + // we add this bias term to pAC[0]. + // Adding the bias term is equivalent to t = toeplitz(pAC) + diag(bias), + // which shifts the eigenvalues of toeplitz(pAC) by the bias + // and ensures that t is invertible.
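For reference, the recursion implemented by DurbinInternal is the textbook Levinson-Durbin algorithm. Below is a minimal, self-contained sketch of that algorithm (illustrative only, not part of this patch); it uses the standard sign convention in which the predictor is x_hat[n] = sum_{i=1}^p a[i] * x[n-i], whereas the patch stores the filter as [1, -a[1], ..., -a[p]] so that convolving with it yields the prediction residual.

#include <vector>

// Textbook Levinson-Durbin: given autocorrelation r[0..p] (with r[0] > 0),
// return a[0..p] where a[1..p] are the forward-predictor coefficients.
std::vector<double> LevinsonDurbin(const std::vector<double> &r, int p) {
  std::vector<double> a(p + 1, 0.0), prev(p + 1, 0.0);
  double err = r[0];                      // order-0 prediction error
  for (int i = 1; i <= p; ++i) {
    double k = r[i];                      // reflection coefficient
    for (int j = 1; j < i; ++j)
      k -= a[j] * r[i - j];
    k /= err;
    prev = a;
    a[i] = k;
    for (int j = 1; j < i; ++j)           // update lower-order coefficients
      a[j] = prev[j] - k * prev[i - j];
    err *= (1.0 - k * k);                 // error shrinks by (1 - k^2)
  }
  return a;
}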
+ double durbin_bias = 1e-2; + int32 max_repeats = 20; + + double E = pAC[0]; + pLP[0] = 1.0; + for (int32 i = 1; i <= n; i++) { + // next reflection coefficient + ki = pAC[i]; + for (int32 j = 1; j < i; j++) + ki += pLP[j] * pAC[i - j]; + ki = ki / E; + + if (std::abs(ki) > 1) { + int32 num_repeats = int((pAC[0] - 1.0) / durbin_bias); + KALDI_WARN << "In Durbin's algorithm, abs(ki) > 1 " + << " for iteration = " << i + << " ki = " << ki + << " autocorr[0] = " << pAC[0] + << " num_repeats = " << num_repeats + << "; adding the bias."; + pAC[0] += durbin_bias; + if (num_repeats < max_repeats) + return -1; + } + // new error + double c = 1 - ki * ki; + if (c < 1.0e-5) // avoid NaNs for constant signals + c = 1.0e-5; + + E *= c; + // new LP coefficients + pTmp[i] = -ki; + for (int32 j = 1; j < i; j++) + pTmp[j] = pLP[j] - ki * pLP[i - j]; + + for (int32 j = 1; j <= i; j++) + pLP[j] = pTmp[j]; + } + return E; +} +/* + This function computes the coefficients of the forward linear predictor + from the autocorrelation coefficients by minimizing the prediction + error in the MSE sense. + Durbin's recursion is used to compute the LP coefficients from the autocorrelation coefficients. + R(j) = sum_{i=1}^P R((i+j) % P) * a[i], j = 0, ..., P + P is the order of linear prediction. + lp_filter = [1, -a[1], -a[2], ..., -a[P]], + where a[i] are the linear prediction coefficients (predicted_x[n] = sum_{i=1}^P a[i] * x[n-i]), so + x[n] - predicted_x[n] = sum_{i=0}^P lp_filter[i] * x[n-i] + = conv(x, lp_filter)[n]. + R(j) is the j-th autocorrelation coefficient. +*/ +void ComputeFilters(const VectorBase &autocorr, + Vector *lp_filter) { + int32 n = autocorr.Dim(); + lp_filter->Resize(n); + // compute lpc coefficients from the autocorrelation coefficients + // with Durbin's algorithm + Vector d_autocorr(autocorr), + d_lpc_coeffs(n), d_tmp(n); + + KALDI_LOG << "computing lpc from correlations "; + while (DurbinInternal(n, d_autocorr.Data(), + d_lpc_coeffs.Data(), + d_tmp.Data()) < 0); + lp_filter->CopyFromVec(d_lpc_coeffs); + if (KALDI_ISNAN(lp_filter->Sum())) { + KALDI_WARN << "NaN encountered in the lpc coefficients derived from Durbin's algorithm."; + lp_filter->Set(0.0); + (*lp_filter)(0) = 1.0; + } + +} + +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using kaldi::int32; + + const char *usage = + "Computes LP coefficients per speaker, by minimizing the " + "prediction error in the MSE sense.\n" + "These coefficients contain speaker-dependent information corresponding to each speaker.\n" + + "Usage: compute-filter [options] \n" + "e.g.: compute-filter " + " scp:data/train/wav.scp ark,scp:filter.ark,filter.scp\n"; + + ParseOptions po(usage); + std::string spk2utt_rspecifier; + bool binary = true; + int32 channel = -1, + lpc_order = 100; + po.Register("binary", &binary, "write in binary mode (applies only to global filters)"); + po.Register("lpc-order", &lpc_order, "number of LP coefficients used to extract filters."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + int32 num_done = 0, num_err = 0; + std::string wav_rspecifier = po.GetArg(1), + wspecifier = po.GetArg(2); + + BaseFloatVectorWriter writer(wspecifier); + if (spk2utt_rspecifier != "") { + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessTableReader wav_reader(wav_rspecifier); + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + std::string spk = spk2utt_reader.Key(); + const std::vector &uttlist = spk2utt_reader.Value(); + CorrelationStats acc_corr_stats(lpc_order); + for (size_t i = 0; i <
uttlist.size(); i++) { + std::string utt = uttlist[i]; + if (!wav_reader.HasKey(utt)) { + KALDI_WARN << "Did not find wave for utterance " << utt; + num_err++; + continue; + } + const WaveData &wav_data = wav_reader.Value(utt); + int32 num_chan = wav_data.Data().NumRows(), this_chan = channel; + KALDI_ASSERT(num_chan > 0); + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << spk << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + num_err++; + continue; + } + } + Vector waveform(wav_data.Data().Row(this_chan)); + waveform.Scale(1.0 / (1 << 15)); + AccStatsForCorrelation(waveform, lpc_order, + &acc_corr_stats); + } + Vector filter, autocorr(lpc_order); + ComputeCorrelation(acc_corr_stats, + &autocorr); + ComputeFilters(autocorr, &filter); + writer.Write(spk, filter); + num_done++; + } + } else { // assume the input waveform is per-speaker. + SequentialTableReader wav_reader(wav_rspecifier); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string spk = wav_reader.Key(); + const WaveData &wave_data = wav_reader.Value(); + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + KALDI_ASSERT(num_chan > 0); + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << spk << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + num_err++; + continue; + } + } + Vector waveform(wave_data.Data().Row(this_chan)); + Vector autocorr, filter; + waveform.Scale(1.0 / (1 << 15)); + KALDI_ASSERT(waveform.Max() <=1 && waveform.Min() >= -1); + CorrelationStats acc_corr_stats(lpc_order); + + AccStatsForCorrelation(waveform, lpc_order, + &acc_corr_stats); + ComputeCorrelation(acc_corr_stats, + &autocorr); + //KALDI_LOG << "autocorr = " << autocorr; + ComputeFilters(autocorr, &filter); + writer.Write(spk, filter); + num_done++; + } + } + KALDI_LOG << "Done " << num_done << " speakers, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fstbin/fstaddselfloops.cc b/src/fstbin/fstaddselfloops.cc index 9219093bee1..96895f23cf4 100644 --- a/src/fstbin/fstaddselfloops.cc +++ b/src/fstbin/fstaddselfloops.cc @@ -45,8 +45,9 @@ int main(int argc, char *argv[]) { "on at least one arc out of the state. 
Useful in conjunction with predeterminize\n" "\n" "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"; - + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; ParseOptions po(usage); po.Read(argc, argv); @@ -62,12 +63,12 @@ int main(int argc, char *argv[]) { fst_out_filename = po.GetOptArg(4); VectorFst *fst = ReadFstKaldi(fst_in_filename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " << kaldi::PrintableRxfilename(disambig_in_rxfilename); - + std::vector disambig_out; if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { WriteFstKaldi(*fst, fst_out_filename); delete fst; - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 25acf48a7d1..4e5cbd45282 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -429,18 +429,6 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model, } } -// comparator object that can be used to sort from greatest to -// least posterior. -struct CompareReverseSecond { - // view this as an "<" operator used for sorting, except it behaves like - // a ">" operator on the .second field of the pair because we want the - // sort to be in reverse order (greatest to least) on posterior. - bool operator() (const std::pair &a, - const std::pair &b) { - return (a.second > b.second); - } -}; - BaseFloat VectorToPosteriorEntry( const VectorBase &log_likes, int32 num_gselect, diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index 18bbd65a86a..4f5896da7c6 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -155,6 +155,18 @@ int32 MergePosteriors(const Posterior &post1, bool drop_frames, Posterior *post); +// comparator object that can be used to sort from greatest to +// least posterior. +struct CompareReverseSecond { + // view this as an "<" operator used for sorting, except it behaves like + // a ">" operator on the .second field of the pair because we want the + // sort to be in reverse order (greatest to least) on posterior. + bool operator() (const std::pair &a, + const std::pair &b) { + return (a.second > b.second); + } +}; + /// Given a vector of log-likelihoods (typically of Gaussians in a GMM /// but could be of pdf-ids), a number gselect >= 1 and a minimum posterior /// 0 <= min_post < 1, it gets the posterior for each element of log-likes diff --git a/src/ivector/plda.h b/src/ivector/plda.h index f5affa5d1ae..57609633169 100644 --- a/src/ivector/plda.h +++ b/src/ivector/plda.h @@ -73,8 +73,8 @@ class Plda { /// before giving them to the function LogLikelihoodRatio (it's /// done this way for efficiency because a given iVector may be /// used multiple times in LogLikelihoodRatio and we don't want - /// do repeat the matrix multiplication - /// + /// to repeat the matrix multiplication + /// /// If config.normalize_length == true, it will also normalize the length of /// the iVector so that it is equal to the sqrt(dim). 
The normalization /// factor is returned, even if config.normalize_length == false, in which @@ -88,7 +88,7 @@ class Plda { float TransformIvector(const PldaConfig &config, const VectorBase &ivector, VectorBase *transformed_ivector) const; - + /// Returns the log-likelihood ratio /// log (p(test_ivector | same) / p(test_ivector | different)). /// transformed_train_ivector is an average over utterances for @@ -100,7 +100,7 @@ class Plda { int32 num_train_utts, const VectorBase &transformed_test_ivector); - + /// This function smooths the within-class covariance by adding to it, /// smoothing_factor (e.g. 0.1) times the between-class covariance (it's /// implemented by modifying transform_). This is to compensate for @@ -108,7 +108,7 @@ class Plda { /// estimate of the within-class covariance, and where the leading elements of /// psi_ were as a result very large. void SmoothWithinClassCovariance(double smoothing_factor); - + int32 Dim() const { return mean_.Dim(); } void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); @@ -116,7 +116,7 @@ class Plda { void ComputeDerivedVars(); // computes offset_. friend class PldaEstimator; friend class PldaUnsupervisedAdaptor; - + Vector mean_; // mean of samples in original space. Matrix transform_; // of dimension Dim() by Dim(); // this transform makes within-class covar unit @@ -142,7 +142,7 @@ class PldaStats { /// to weight your training samples. void AddSamples(double weight, const Matrix &group); - + int32 Dim() const { return dim_; } void Init(int32 dim); @@ -151,9 +151,9 @@ class PldaStats { bool IsSorted() const; ~PldaStats(); protected: - + friend class PldaEstimator; - + int32 dim_; int64 num_classes_; int64 num_examples_; // total number of examples, sumed over classes. @@ -165,7 +165,7 @@ class PldaStats { SpMatrix offset_scatter_; // Sum over all examples, of the weight // times (example - class-mean). - + // We have one of these objects per class. struct ClassInfo { double weight; @@ -178,7 +178,7 @@ class PldaStats { ClassInfo(double weight, Vector *mean, int32 num_examples): weight(weight), mean(mean), num_examples(num_examples) { } }; - + std::vector class_info_; private: KALDI_DISALLOW_COPY_AND_ASSIGN(PldaStats); @@ -197,16 +197,16 @@ struct PldaEstimationConfig { class PldaEstimator { public: PldaEstimator(const PldaStats &stats); - + void Estimate(const PldaEstimationConfig &config, Plda *output); private: typedef PldaStats::ClassInfo ClassInfo; - + /// Returns the part of the objf relating to /// offsets from the class means. (total, not normalized) double ComputeObjfPart1() const; - + /// Returns the part of the obj relating to /// the class means (total_not normalized) double ComputeObjfPart2() const; @@ -217,7 +217,7 @@ class PldaEstimator { int32 Dim() const { return stats_.Dim(); } void EstimateOneIter(); - + void InitParameters(); void ResetPerIterStats(); @@ -233,7 +233,7 @@ class PldaEstimator { // Copy to output. void GetOutput(Plda *plda); - + const PldaStats &stats_; SpMatrix within_var_; @@ -254,7 +254,7 @@ struct PldaUnsupervisedAdaptorConfig { BaseFloat mean_diff_scale; BaseFloat within_covar_scale; BaseFloat between_covar_scale; - + PldaUnsupervisedAdaptorConfig(): mean_diff_scale(1.0), within_covar_scale(0.3), @@ -285,7 +285,7 @@ class PldaUnsupervisedAdaptor { // Add stats to this class. Normally the weight will be 1.0. 
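As a usage note on the CompareReverseSecond comparator that the posterior.h hunk above makes public: it is meant to be handed to std::sort so that (index, posterior) pairs end up ordered from greatest to least posterior. A small sketch (illustrative only, not part of the patch; the pair type std::pair<int32, BaseFloat> is assumed here, matching how gselect-style posteriors are stored elsewhere in Kaldi):

#include <algorithm>
#include <utility>
#include <vector>
#include "hmm/posterior.h"

// Sort (index, posterior) pairs so that the largest posteriors come first.
void SortByPosteriorDescending(
    std::vector<std::pair<kaldi::int32, kaldi::BaseFloat> > *pairs) {
  kaldi::CompareReverseSecond comp;  // acts like ">" on the .second field
  std::sort(pairs->begin(), pairs->end(), comp);
}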
void AddStats(double weight, const Vector &ivector); void AddStats(double weight, const Vector &ivector); - + void UpdatePlda(const PldaUnsupervisedAdaptorConfig &config, Plda *plda) const; @@ -293,7 +293,7 @@ class PldaUnsupervisedAdaptor { double tot_weight_; Vector mean_stats_; - SpMatrix variance_stats_; + SpMatrix variance_stats_; }; diff --git a/src/kwsbin/compute-atwv.cc b/src/kwsbin/compute-atwv.cc index 1b7476723c0..c7c8e484f8d 100644 --- a/src/kwsbin/compute-atwv.cc +++ b/src/kwsbin/compute-atwv.cc @@ -37,13 +37,34 @@ int main(int argc, char *argv[]) { const char *usage = "Computes the Actual Term-Weighted Value and prints it." "\n" - "Usage: compute-atwv [options] ref-rspecifier hyp-rspecifier [alignment csv]\n" - " e.g.: compute-atwv ark:ref.1 ark:hyp.1 ali.csv\n" + "Usage: compute-atwv [options] [alignment-csv-filename]\n" + " e.g.: compute-atwv 32485.4 ark:ref.1 ark:hyp.1 ali.csv\n" + " or: compute-atwv 32485.4 ark:ref.1 ark:hyp.1\n" "\n" - "where the alignment format is compatible with the alignment produced\n" - "using the F4DE tool -- you are responsible for mapping the utterance\n" - "identifiers and the term string to the correct ones - use the script\n" - "utils/int2sym.pl and the utterance/keyword maps\n"; + "NOTES: \n" + " a) the number of trials is usually equal to the size of the searched\n" + " collection in seconds\n" + " b) the ref-rspecifier/hyp-rspecifier are the kaldi IO specifiers for both\n" + " the reference and the hypotheses (found hits), respectively.\n" + " The format is the same for both of them. Each line is of \n" + " the following format\n" + "\n" + " \n\n" + " e.g.:\n\n" + " KW106-189 348 459 560 0.8\n" + "\n" + " c) the alignment-csv-filename is an optional parameter. If present,\n" + " the alignment, i.e. detailed information about which hypotheses match\n" + " up with which reference entries, will be generated. The alignment\n" + " file format is equivalent to the alignment file produced using\n" + " the F4DE tool. However, we do not set some fields and the utterance\n" + " identifiers are numeric. You can use the script utils/int2sym.pl\n" + " and the utterance/keyword maps to convert the numerical ids into text.\n" + " d) the scores are expected to be probabilities. Please note that\n" + " the output from the kws-search is in -log(probability).\n" + " e) compute-atwv does not perform any score normalization (it's just\n" + " for scoring purposes). Without score normalization/calibration\n" + " the performance of the search will be quite poor.\n"; ParseOptions po(usage); KwsTermsAlignerOptions ali_opts; diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index 8e92f939ef9..e38c62b3bfa 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -19,17 +19,6 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License.
-#ifdef _MSC_VER -#include -using std::unordered_map; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -using std::unordered_map; -#else -#include -using std::tr1::unordered_map; -#endif - #include #include #include "fstext/determinize-lattice.h" // for LatticeStringRepository diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index 0ea66712eda..d8443bd7434 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -405,15 +405,11 @@ static inline double LogAddOrMax(bool viterbi, double a, double b) { return LogAdd(a, b); } -// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or -// best-path negated cost) Note: in either case, the alphas and betas are -// negated costs. Requires that lat be topologically sorted. This code -// will work for either CompactLattice or Latice. template -static double ComputeLatticeAlphasAndBetas(const LatticeType &lat, - bool viterbi, - vector *alpha, - vector *beta) { +double ComputeLatticeAlphasAndBetas(const LatticeType &lat, + bool viterbi, + vector *alpha, + vector *beta) { typedef typename LatticeType::Arc Arc; typedef typename Arc::Weight Weight; typedef typename Arc::StateId StateId; @@ -462,6 +458,19 @@ static double ComputeLatticeAlphasAndBetas(const LatticeType &lat, return 0.5 * (tot_backward_prob + tot_forward_prob); } +// instantiate the template for Lattice and CompactLattice +template +double ComputeLatticeAlphasAndBetas(const Lattice &lat, + bool viterbi, + vector *alpha, + vector *beta); + +template +double ComputeLatticeAlphasAndBetas(const CompactLattice &lat, + bool viterbi, + vector *alpha, + vector *beta); + /// This is used in CompactLatticeLimitDepth. diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h index 505aaffbe55..c58b2ec32b8 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -45,7 +45,7 @@ int32 LatticeStateTimes(const Lattice &lat, std::vector *times); /// As LatticeStateTimes, but in the CompactLattice format. Note: must /// be topologically sorted. Returns length of the utterance in frames, which -/// may not be the same as the maximum time in the lattice, due to frames +/// might not be the same as the maximum time in the lattice, due to frames /// in the final-prob. int32 CompactLatticeStateTimes(const CompactLattice &clat, std::vector *times); @@ -64,7 +64,7 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, double *acoustic_like_sum = NULL); // This function is something similar to LatticeForwardBackward(), but it is on -// the CompactLattice lattice format. Also we only need the alpha in the forward +// the CompactLattice lattice format. Also we only need the alpha in the forward // path, not the posteriors. bool ComputeCompactLatticeAlphas(const CompactLattice &lat, vector *alpha); @@ -74,6 +74,18 @@ bool ComputeCompactLatticeAlphas(const CompactLattice &lat, bool ComputeCompactLatticeBetas(const CompactLattice &lat, vector *beta); + +// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or +// best-path negated cost) Note: in either case, the alphas and betas are +// negated costs. Requires that lat be topologically sorted. This code +// will work for either CompactLattice or Latice. +template +double ComputeLatticeAlphasAndBetas(const LatticeType &lat, + bool viterbi, + vector *alpha, + vector *beta); + + /// Topologically sort the compact lattice if not already topologically sorted. /// Will crash if the lattice cannot be topologically sorted. 
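The lattice-functions change above (dropping "static", declaring ComputeLatticeAlphasAndBetas in the header, and adding the two "template double ComputeLatticeAlphasAndBetas..." lines) is the usual explicit-instantiation pattern: the template definition stays in the .cc file, and only the instantiations that callers actually need are emitted. A generic sketch of the pattern with made-up names (illustrative only, not part of the patch):

// In the header: declaration only.
template <typename LatticeType>
double TotalScore(const LatticeType &lat);

// In the .cc file: the definition, plus explicit instantiations so that code
// which only sees the header declaration still links against these versions.
struct ToyLattice        { int NumStates() const { return 3; } };
struct ToyCompactLattice { int NumStates() const { return 5; } };

template <typename LatticeType>
double TotalScore(const LatticeType &lat) {
  return static_cast<double>(lat.NumStates());
}

template double TotalScore<ToyLattice>(const ToyLattice &lat);
template double TotalScore<ToyCompactLattice>(const ToyCompactLattice &lat);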
void TopSortCompactLatticeIfNeeded(CompactLattice *clat); diff --git a/src/latbin/Makefile b/src/latbin/Makefile index f1633978fbf..74bf664b6c6 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -20,7 +20,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-minimize lattice-limit-depth lattice-depth-per-frame \ lattice-confidence lattice-determinize-phone-pruned \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ - lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons + lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ + lattice-arc-post lattice-determinize-non-compact OBJFILES = @@ -30,7 +31,7 @@ TESTFILES = ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../hmm/kaldi-hmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../thread/kaldi-thread.a ../fstext/kaldi-fstext.a ../base/kaldi-base.a + ../thread/kaldi-thread.a ../fstext/kaldi-fstext.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/lattice-arc-post.cc b/src/latbin/lattice-arc-post.cc new file mode 100644 index 00000000000..38a5d6d304d --- /dev/null +++ b/src/latbin/lattice-arc-post.cc @@ -0,0 +1,214 @@ +// latbin/lattice-arc-post.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +namespace kaldi { + +// This class computes and outputs +// the information about arc posteriors. + +class ArcPosteriorComputer { + public: + // Note: 'clat' must be topologically sorted. + ArcPosteriorComputer(const CompactLattice &clat, + BaseFloat min_post, + bool print_alignment, + const TransitionModel *trans_model = NULL): + clat_(clat), min_post_(min_post), print_alignment_(print_alignment), + trans_model_(trans_model) { } + + // returns the number of arc posteriors that it output. 
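The quantity printed by OutputPosteriors below is the standard forward-backward arc posterior: log post(arc) = alpha[src] - cost(arc) + beta[dst] - log p(lattice), clamped at zero and exponentiated. A self-contained sketch of that computation on a generic arc list (illustrative only, not part of the patch; alpha and beta are negated costs, as in the code below):

#include <algorithm>
#include <cmath>
#include <vector>

struct SimpleArc { int src, dst; double cost; };  // cost = graph + acoustic

// alpha[s] / beta[s] are forward / backward log-likelihoods of state s;
// tot_like is beta[start], i.e. the total log-likelihood of the lattice.
std::vector<double> ArcPosteriors(const std::vector<SimpleArc> &arcs,
                                  const std::vector<double> &alpha,
                                  const std::vector<double> &beta,
                                  double tot_like) {
  std::vector<double> post(arcs.size());
  for (size_t i = 0; i < arcs.size(); ++i) {
    double log_post = alpha[arcs[i].src] - arcs[i].cost +
                      beta[arcs[i].dst] - tot_like;
    post[i] = std::exp(std::min(log_post, 0.0));  // clamp small positive drift
  }
  return post;
}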
+ int32 OutputPosteriors(const std::string &utterance, + std::ostream &os) { + int32 num_post = 0; + if (!ComputeCompactLatticeAlphas(clat_, &alpha_)) + return num_post; + if (!ComputeCompactLatticeBetas(clat_, &beta_)) + return num_post; + + CompactLatticeStateTimes(clat_, &state_times_); + if (clat_.Start() < 0) + return 0; + double tot_like = beta_[clat_.Start()]; + + int32 num_states = clat_.NumStates(); + for (int32 state = 0; state < num_states; state++) { + for (fst::ArcIterator aiter(clat_, state); + !aiter.Done(); aiter.Next()) { + const CompactLatticeArc &arc = aiter.Value(); + double arc_loglike = -ConvertToCost(arc.weight) + + alpha_[state] + beta_[arc.nextstate] - tot_like; + KALDI_ASSERT(arc_loglike < 0.1 && + "Bad arc posterior in forward-backward computation"); + if (arc_loglike > 0.0) arc_loglike = 0.0; + int32 num_frames = arc.weight.String().size(), + word = arc.ilabel; + BaseFloat arc_post = exp(arc_loglike); + if (arc_post <= min_post_) continue; + os << utterance << '\t' << state_times_[state] << '\t' << num_frames + << '\t' << arc_post << '\t' << word; + if (print_alignment_) { + os << '\t'; + const std::vector &ali = arc.weight.String(); + for (int32 frame = 0; frame < num_frames; frame++) { + os << ali[frame]; + if (frame + 1 < num_frames) os << ','; + } + } + if (trans_model_ != NULL) { + // we want to print the phone sequence too. + os << '\t'; + const std::vector &ali = arc.weight.String(); + bool first_phone = true; + for (int32 frame = 0; frame < num_frames; frame++) { + if (trans_model_->IsFinal(ali[frame])) { + if (first_phone) first_phone = false; + else os << ' '; + os << trans_model_->TransitionIdToPhone(ali[frame]); + } + } + } + os << std::endl; + num_post++; + } + } + return num_post; + } + private: + const CompactLattice &clat_; + std::vector alpha_; + std::vector beta_; + std::vector state_times_; + + BaseFloat min_post_; + bool print_alignment_; + const TransitionModel *trans_model_; +}; + +} + + +int main(int argc, char *argv[]) { + try { + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Print out information regarding posteriors of lattice arcs\n" + "This program computes posteriors from a lattice and prints out\n" + "information for each arc (the format is reminiscent of ctm, but\n" + "contains information from multiple paths). Each line is:\n" + " [] [ ...]\n" + "for instance:\n" + "2013a04-bk42\t104\t26\t0.95\t0\t11,242,242,242,71,894,894,62,63,63,63,63\t2 8 9\n" + "where the --print-alignment option determines whether the alignments (i.e. the\n" + "sequences of transition-ids) are printed, and the phones are printed only if the\n" + " is supplied on the command line. Note, there are tabs between the major\n" + "fields, but the phones are separated by spaces.\n" + "Usage: lattice-arc-post [] \n" + "e.g.: lattice-arc-post --acoustic-scale=0.1 final.mdl 'ark:gunzip -c lat.1.gz|' post.txt\n" + "You will probably want to word-align the lattices (e.g. 
lattice-align-words or\n" + "lattice-align-words-lexicon) before this program, apply an acoustic scale either\n" + "via the --acoustic-scale option or using lattice-scale.\n" + "See also: lattice-post, lattice-to-ctm-conf, nbest-to-ctm\n"; + + kaldi::BaseFloat acoustic_scale = 1.0, lm_scale = 1.0; + kaldi::BaseFloat min_post = 0.0001; + bool print_alignment = false; + + kaldi::ParseOptions po(usage); + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("lm-scale", &lm_scale, + "Scaling factor for \"graph costs\" (including LM costs)"); + po.Register("print-alignment", &print_alignment, + "If true, print alignments (i.e. sequences of transition-ids) for each\n" + "arc."); + po.Register("min-post", &min_post, + "Arc posteriors below this value will be pruned away"); + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + + kaldi::TransitionModel trans_model; + + std::string lats_rspecifier, output_wxfilename; + if (po.NumArgs() == 3) { + ReadKaldiObject(po.GetArg(1), &trans_model); + lats_rspecifier = po.GetArg(2); + output_wxfilename = po.GetArg(3); + } else { + lats_rspecifier = po.GetArg(1); + output_wxfilename = po.GetArg(2); + } + + + kaldi::Output output(output_wxfilename, false); + + // Read as regular lattice + kaldi::SequentialCompactLatticeReader clat_reader(lats_rspecifier); + + int64 tot_post = 0; + int32 num_lat_done = 0, num_lat_err = 0; + + for (; !clat_reader.Done(); clat_reader.Next()) { + std::string key = clat_reader.Key(); + kaldi::CompactLattice clat = clat_reader.Value(); + // FreeCurrent() is an optimization that prevents the lattice from being + // copied unnecessarily (OpenFst does copy-on-write). + clat_reader.FreeCurrent(); + fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &clat); + kaldi::TopSortCompactLatticeIfNeeded(&clat); + + kaldi::ArcPosteriorComputer computer( + clat, min_post, print_alignment, + (po.NumArgs() == 3 ? &trans_model : NULL)); + + int32 num_post = computer.OutputPosteriors(key, output.Stream()); + if (num_post != 0) { + num_lat_done++; + tot_post += num_post; + } else { + num_lat_err++; + KALDI_WARN << "No posterior printed for " << key; + } + } + KALDI_LOG << "Printed posteriors for " << num_lat_done << " lattices (" + << num_lat_err << " with errors); on average printed " + << (tot_post / (num_lat_done == 0 ? 1 : num_lat_done)) + << " posteriors per lattice."; + return (num_lat_done > 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/latbin/lattice-best-path.cc b/src/latbin/lattice-best-path.cc index dda41cd0604..dc25fb351c6 100644 --- a/src/latbin/lattice-best-path.cc +++ b/src/latbin/lattice-best-path.cc @@ -121,7 +121,7 @@ int main(int argc, char *argv[]) { } BaseFloat tot_weight_float = tot_weight.Value1() + tot_weight.Value2(); - KALDI_LOG << "Overall score per frame is " << (tot_weight_float/n_frame) + KALDI_LOG << "Overall cost per frame is " << (tot_weight_float/n_frame) << " = " << (tot_weight.Value1()/n_frame) << " [graph]" << " + " << (tot_weight.Value2()/n_frame) << " [acoustic]" << " over " << n_frame << " frames."; diff --git a/src/latbin/lattice-copy.cc b/src/latbin/lattice-copy.cc index 76ca034b2e4..f66eb699705 100644 --- a/src/latbin/lattice-copy.cc +++ b/src/latbin/lattice-copy.cc @@ -24,6 +24,108 @@ #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +namespace kaldi { + int32 CopySubsetLattices(std::string filename, + SequentialLatticeReader *lattice_reader, + LatticeWriter *lattice_writer, + bool include = true, bool ignore_missing = false + ) { + unordered_set subset; + std::set subset_list; + + bool binary; + Input ki(filename, &binary); + KALDI_ASSERT(!binary); + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + SplitStringToVector(line, " \t\r", true, &split_line); + if(split_line.empty()) { + KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename; + } + subset.insert(split_line[0]); + subset_list.insert(split_line[0]); + } + + int32 num_total = 0; + size_t num_success = 0; + for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { + if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + KALDI_LOG << "The utterance " << lattice_reader->Key() + << " is larger than " + << "the last key in the include list. Not reading further."; + KALDI_LOG << "Wrote " << num_success << " utterances"; + return 0; + } + + if (include && subset.count(lattice_reader->Key()) > 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } else if (!include && subset.count(lattice_reader->Key()) == 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } + } + + KALDI_LOG << "Wrote " << num_success << " out of " << num_total + << " utterances."; + + if (ignore_missing) return 0; + + return (num_success != 0 ? 0 : 1); + } + + int32 CopySubsetLattices(std::string filename, + SequentialCompactLatticeReader *lattice_reader, + CompactLatticeWriter *lattice_writer, + bool include = true, bool ignore_missing = false + ) { + unordered_set subset; + std::set subset_list; + + bool binary; + Input ki(filename, &binary); + KALDI_ASSERT(!binary); + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + SplitStringToVector(line, " \t\r", true, &split_line); + if(split_line.empty()) { + KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename; + } + subset.insert(split_line[0]); + subset_list.insert(split_line[0]); + } + + int32 num_total = 0; + size_t num_success = 0; + for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { + if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + KALDI_LOG << "The utterance " << lattice_reader->Key() + << " is larger than " + << "the last key in the include list. 
Not reading further."; + KALDI_LOG << "Wrote " << num_success << " utterances"; + return 0; + } + + if (include && subset.count(lattice_reader->Key()) > 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } else if (!include && subset.count(lattice_reader->Key()) == 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } + } + + KALDI_LOG << "Wrote " << num_success << " out of " << num_total + << " utterances."; + + if (ignore_missing) return 0; + + return (num_success != 0 ? 0 : 1); + } +} + int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -36,14 +138,32 @@ int main(int argc, char *argv[]) { const char *usage = "Copy lattices (e.g. useful for changing to text mode or changing\n" "format to standard from compact lattice.)\n" + "The --include and --exclude options can be used to copy only a subset " + "of lattices, where the --include option specifies the " + "whitelisted utterances that would be copied and the --exclude option " + "specifies the blacklisted utterances that would not be copied.\n" + "Only one of --include and --exclude can be supplied.\n" "Usage: lattice-copy [options] lattice-rspecifier lattice-wspecifier\n" " e.g.: lattice-copy --write-compact=false ark:1.lats ark,t:text.lats\n" "See also: lattice-to-fst, and the script egs/wsj/s5/utils/convert_slf.pl\n"; ParseOptions po(usage); - bool write_compact = true; + bool write_compact = true, ignore_missing = false; + std::string include_rxfilename; + std::string exclude_rxfilename; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - + po.Register("include", &include_rxfilename, + "Text file, the first field of each " + "line being interpreted as the " + "utterance-id whose lattices will be included"); + po.Register("exclude", &exclude_rxfilename, + "Text file, the first field of each " + "line being interpreted as an utterance-id " + "whose lattices will be excluded"); + po.Register("ignore-missing", &ignore_missing, + "Exit with status 0 even if no lattices are copied"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -59,15 +179,46 @@ int main(int argc, char *argv[]) { if (write_compact) { SequentialCompactLatticeReader lattice_reader(lats_rspecifier); CompactLatticeWriter lattice_writer(lats_wspecifier); + + if (include_rxfilename != "") { + if (exclude_rxfilename != "") { + KALDI_ERR << "should not have both --exclude and --include option!"; + } + return CopySubsetLattices(include_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } else if (exclude_rxfilename != "") { + return CopySubsetLattices(exclude_rxfilename, + &lattice_reader, &lattice_writer, + false, ignore_missing); + } + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value()); } else { SequentialLatticeReader lattice_reader(lats_rspecifier); LatticeWriter lattice_writer(lats_wspecifier); + + if (include_rxfilename != "") { + if (exclude_rxfilename != "") { + KALDI_ERR << "should not have both --exclude and --include option!"; + } + return CopySubsetLattices(include_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } else if (exclude_rxfilename != "") { + return CopySubsetLattices(exclude_rxfilename, + &lattice_reader, &lattice_writer, + false, ignore_missing); + } + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) lattice_writer.Write(lattice_reader.Key(),
lattice_reader.Value()); } KALDI_LOG << "Done copying " << n_done << " lattices."; + + if (ignore_missing) return 0; + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc new file mode 100644 index 00000000000..8665fcb58d1 --- /dev/null +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -0,0 +1,317 @@ +// latbin/lattice-determinize-non-compact.cc + +// Copyright 2009-2012 Microsoft Corporation +// 2012-2013 Johns Hopkins University (Author: Daniel Povey) +// 2015 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "util/stl-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "lat/minimize-lattice.h" + +namespace kaldi { + +typedef Lattice::StateId StateId; +typedef Lattice::Arc Arc; + +// This function is a copy of the function in the program lattice-determinize +bool DeterminizeLatticeWrapper(const Lattice &lat, + const std::string &key, + bool prune, + BaseFloat beam, + BaseFloat beam_ratio, + int32 max_mem, + int32 max_loop, + BaseFloat delta, + int32 num_loops, + CompactLattice *clat) { + fst::DeterminizeLatticeOptions lat_opts; + lat_opts.max_mem = max_mem; + lat_opts.max_loop = max_loop; + lat_opts.delta = delta; + BaseFloat cur_beam = beam; + for (int32 i = 0; i < num_loops;) { // we increment i below. + + if (lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Detected empty lattice, skipping " << key; + return false; + } + + // The work gets done in the next line. + if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { + if (prune) PruneLattice(cur_beam, clat); + return true; + } else { // failed to determinize.. 
+ KALDI_WARN << "Failed to determinize lattice (presumably max-states " + << "reached), reducing lattice-beam to " + << (cur_beam*beam_ratio) << " and re-trying."; + for (; i < num_loops; i++) { + cur_beam *= beam_ratio; + Lattice pruned_lat(lat); + PruneLattice(cur_beam, &pruned_lat); + if (NumArcs(lat) == NumArcs(pruned_lat)) { + cur_beam *= beam_ratio; + KALDI_WARN << "Pruning did not have an effect on the original " + << "lattice size; reducing beam to " + << cur_beam << " and re-trying."; + } else if (DeterminizeLattice(pruned_lat, clat, lat_opts, NULL)) { + if (prune) PruneLattice(cur_beam, clat); + return true; + } else { + KALDI_WARN << "Determinization failed again; reducing beam again to " + << (cur_beam*beam_ratio) << " and re-trying."; + } + } + } + } + KALDI_WARN << "Decreased pruning beam --num-loops=" << num_loops + << " times and was not able to determinize: failed for " + << key; + return false; +} + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores) { + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 + && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different " + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 as we are reading from + // non-determinized, non-compact lattice + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat) { + fst::TopSort(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + +} + +int main(int argc, char *argv[]) { + try { + 
using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "lattice-determinize lattices (and apply a pruning beam)\n" + " (see http://kaldi.sourceforge.net/lattices.html for more explanation)\n" + "This version of the program retains the original " + "acoustic scores of arcs in the determinized lattice and writes it " + "as a normal (non-compact) lattice. \n" + " note: this program is tyically only useful if you generated state-level\n" + " lattices, e.g. called gmm-latgen-simple with --determinize=false\n" + "\n" + "Usage: lattice-determinize-non-compact [options] lattice-rspecifier lattice-wspecifier\n" + " e.g.: lattice-determinize-non-compact --acoustic-scale=0.1 --beam=15.0 ark:1.lats ark:det.lats\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat beam = 10.0; + BaseFloat beam_ratio = 0.9; + int32 num_loops = 20; + int32 max_mem = 50000000; // 50 MB + int32 max_loop = 500000; + BaseFloat delta = fst::kDelta; + bool prune = false; + bool minimize = false; + + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("beam", &beam, + "Pruning beam [applied after acoustic scaling]-- also used " + "to handle determinization failures, set --prune=false to " + "disable routine pruning"); + po.Register("delta", &delta, "Tolerance used in determinization"); + po.Register("prune", &prune, "If true, prune determinized lattices " + "with the --beam option."); + po.Register("max-mem", &max_mem, "Maximum approximate memory usage in " + "determinization (real usage might be many times this)"); + po.Register("max-loop", &max_loop, "Option to detect a certain " + "type of failure in lattice determinization (not critical)"); + po.Register("beam-ratio", &beam_ratio, "Ratio by which to " + "decrease beam if we reach the max-arcs."); + po.Register("num-loops", &num_loops, "Number of times to " + "decrease beam by beam-ratio if determinization fails."); + po.Register("minimize", &minimize, + "If true, push and minimize after determinization"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + lats_wspecifier = po.GetArg(2); + + // Read as regular lattice-- this is the form we need it in for efficient + // pruning. + SequentialLatticeReader lattice_reader(lats_rspecifier); + + // Write as regular lattice. + LatticeWriter lattice_writer(lats_wspecifier); + + int32 n_done = 0, n_error = 0; + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + LatticeWeight beam_weight(beam, static_cast(0.0)); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + + lattice_reader.FreeCurrent(); + + fst::TopSort(&lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); + + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + + Invert(&lat); // make it so word labels are on the input. 
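The ComputeAcousticScoresMap / ReplaceAcousticScoresFromMap pair used above reduces to per-key accumulate-then-average bookkeeping on (frame, transition-id) keys. A minimal sketch with standard containers (illustrative only, not part of the patch; the patch itself keys an unordered_map with Kaldi's PairHasher):

#include <map>
#include <utility>

typedef std::map<std::pair<int, int>, std::pair<double, int> > ScoreMap;

// Accumulate one acoustic cost observed for (frame t, transition-id tid).
void AccumulateScore(int t, int tid, double acoustic_cost, ScoreMap *stats) {
  std::pair<double, int> &entry = (*stats)[std::make_pair(t, tid)];
  entry.first += acoustic_cost;  // running sum of costs
  entry.second += 1;             // number of arcs seen for this (t, tid)
}

// Averaged cost to write back onto the determinized lattice's arcs.
double AveragedScore(const ScoreMap &stats, int t, int tid) {
  ScoreMap::const_iterator it = stats.find(std::make_pair(t, tid));
  return it->second.first / it->second.second;  // caller ensures the key exists
}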
+ + CompactLattice clat; + if (DeterminizeLatticeWrapper(lat, key, prune, + beam, beam_ratio, max_mem, max_loop, + delta, num_loops, &clat)) { + if (minimize) { + PushCompactLatticeStrings(&clat); + PushCompactLatticeWeights(&clat); + MinimizeCompactLattice(&clat); + } + + Lattice out_lat; + fst::ConvertLattice(clat, &out_lat); + fst::TopSort(&out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), + &out_lat); + lattice_writer.Write(key, out_lat); + n_done++; + } else { + n_error++; // will have already printed warning. + } + } + + KALDI_LOG << "Done " << n_done << " lattices, errors on " << n_error; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/latbin/lattice-to-ctm-conf.cc b/src/latbin/lattice-to-ctm-conf.cc index 5489e560be8..56ea983ac9b 100644 --- a/src/latbin/lattice-to-ctm-conf.cc +++ b/src/latbin/lattice-to-ctm-conf.cc @@ -51,8 +51,8 @@ int main(int argc, char *argv[]) { " e.g.: lattice-to-ctm-conf --acoustic-scale=0.1 ark:1.lats 1.ctm\n" " or: lattice-to-ctm-conf --acoustic-scale=0.1 --decode-mbr=false\\\n" " ark:1.lats ark:1.1best 1.ctm\n" - "See also: lattice-mbr-decode, nbest-to-ctm, steps/get_ctm.sh,\n" - " steps/get_train_ctm.sh and utils/convert_ctm.sh.\n"; + "See also: lattice-mbr-decode, nbest-to-ctm, lattice-arc-post,\n" + " steps/get_ctm.sh, steps/get_train_ctm.sh and utils/convert_ctm.sh.\n"; ParseOptions po(usage); BaseFloat acoustic_scale = 1.0, inv_acoustic_scale = 1.0, lm_scale = 1.0; @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) { po.Register("decode-mbr", &decode_mbr, "If true, do Minimum Bayes Risk " "decoding (else, Maximum a Posteriori)"); po.Register("frame-shift", &frame_shift, "Time in seconds between frames."); - + po.Read(argc, argv); if (po.NumArgs() != 2 && po.NumArgs() != 3) { @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(acoustic_scale == 1.0 || inv_acoustic_scale == 1.0); if (inv_acoustic_scale != 1.0) acoustic_scale = 1.0 / inv_acoustic_scale; - + std::string lats_rspecifier, one_best_rspecifier, ctm_wxfilename; if (po.NumArgs() == 2) { @@ -92,9 +92,9 @@ int main(int argc, char *argv[]) { one_best_rspecifier = po.GetArg(2); ctm_wxfilename = po.GetArg(3); } - + // Ensure the output ctm file is not a wspecifier - WspecifierType ctm_wx_type; + WspecifierType ctm_wx_type; ctm_wx_type = ClassifyWspecifier(ctm_wxfilename, NULL, NULL, NULL); if(ctm_wx_type != kNoWspecifier){ KALDI_ERR << "The output ctm file should not be a wspecifier. " @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { // Read as compact lattice. SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + RandomAccessInt32VectorReader one_best_reader(one_best_rspecifier); Output ko(ctm_wxfilename, false); // false == non-binary writing mode. 
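For orientation, the CTM lines written by lattice-to-ctm-conf here (and by nbest-to-ctm further down) have the form "<utterance-id> <channel> <start-seconds> <duration-seconds> <word-id> [<confidence>]", with frame indices converted to seconds via the frame shift. A minimal sketch of writing one such line (illustrative only; the helper name and default values are made up):

#include <iomanip>
#include <ostream>
#include <string>

// Write one CTM entry; start_frame / num_frames are frame counts that get
// converted to seconds using frame_shift (0.01 s by default in these tools).
void WriteCtmLine(std::ostream &os, const std::string &utt,
                  int start_frame, int num_frames, int word_id,
                  double confidence, double frame_shift = 0.01) {
  os << std::fixed << std::setprecision(2)
     << utt << " 1 " << (frame_shift * start_frame) << ' '
     << (frame_shift * num_frames) << ' ' << word_id << ' '
     << confidence << '\n';
}

This is also why the nbest-to-ctm hunk below bumps the output precision when --frame-shift drops under 0.01: two decimal places cannot represent the smaller shifts.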
@@ -114,7 +114,7 @@ int main(int argc, char *argv[]) { int32 n_done = 0, n_words = 0; BaseFloat tot_bayes_risk = 0.0; - + for (; !clat_reader.Done(); clat_reader.Next()) { std::string key = clat_reader.Key(); CompactLattice clat = clat_reader.Value(); @@ -133,7 +133,7 @@ int main(int argc, char *argv[]) { const std::vector &one_best = one_best_reader.Value(key); mbr = new MinimumBayesRisk(clat, one_best, decode_mbr); } - + const std::vector &conf = mbr->GetOneBestConfidences(); const std::vector &words = mbr->GetOneBest(); const std::vector > × = @@ -146,7 +146,7 @@ int main(int argc, char *argv[]) { << words[i] << ' ' << conf[i] << '\n'; } KALDI_LOG << "For utterance " << key << ", Bayes Risk " - << mbr->GetBayesRisk() << ", avg. confidence per-word " + << mbr->GetBayesRisk() << ", avg. confidence per-word " << std::accumulate(conf.begin(),conf.end(),0.0) / words.size(); n_done++; n_words += mbr->GetOneBest().size(); @@ -158,7 +158,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall average Bayes Risk per sentence is " << (tot_bayes_risk / n_done) << " and per word, " << (tot_bayes_risk / n_words); - + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/latbin/lattice-to-post.cc b/src/latbin/lattice-to-post.cc index 559fa480920..c04a6748a52 100644 --- a/src/latbin/lattice-to-post.cc +++ b/src/latbin/lattice-to-post.cc @@ -35,7 +35,7 @@ int main(int argc, char *argv[]) { "Do forward-backward and collect posteriors over lattices.\n" "Usage: lattice-to-post [options] lats-rspecifier posts-wspecifier [loglikes-wspecifier]\n" " e.g.: lattice-to-post --acoustic-scale=0.1 ark:1.lats ark:1.post\n" - "See also: lattice-to-ctm-conf, post-to-pdf-post\n"; + "See also: lattice-to-ctm-conf, post-to-pdf-post, lattice-arc-post\n"; kaldi::BaseFloat acoustic_scale = 1.0, lm_scale = 1.0; kaldi::ParseOptions po(usage); @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { lattice_reader.FreeCurrent(); if (acoustic_scale != 1.0 || lm_scale != 1.0) fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &lat); - + kaldi::uint64 props = lat.Properties(fst::kFstProperties, false); if (!(props & fst::kTopSorted)) { if (fst::TopSort(&lat) == false) @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { << " arcs. Average log-likelihood = " << (lat_like/lat_time) << " over " << lat_time << " frames. 
Average acoustic log-like" << " per frame is " << (lat_ac_like/lat_time); - - if (loglikes_writer.IsOpen()) + + if (loglikes_writer.IsOpen()) loglikes_writer.Write(key, lat_like); posterior_writer.Write(key, post); diff --git a/src/latbin/nbest-to-ctm.cc b/src/latbin/nbest-to-ctm.cc index 1993041dee6..e396f315ba1 100644 --- a/src/latbin/nbest-to-ctm.cc +++ b/src/latbin/nbest-to-ctm.cc @@ -1,6 +1,6 @@ // latbin/nbest-to-ctm.cc -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +// Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -43,14 +43,19 @@ int main(int argc, char *argv[]) { "e.g.: lattice-1best --acoustic-weight=0.08333 ark:1.lats | \\\n" " lattice-align-words data/lang/phones/word_boundary.int exp/dir/final.mdl ark:- ark:- | \\\n" " nbest-to-ctm ark:- 1.ctm\n"; - + ParseOptions po(usage); + bool print_silence = false; BaseFloat frame_shift = 0.01; int32 precision = 2; + po.Register("print-silence", &print_silence, "If true, print optional-silence " + "() arcs"); po.Register("frame-shift", &frame_shift, "Time in seconds between frames.\n"); po.Register("precision", &precision, - "Number of decimal places for start duration times\n"); + "Number of decimal places for start duration times (note: we " + "may use a higher value than this if it's obvious from " + "--frame-shift that this value is too small"); po.Read(argc, argv); @@ -62,15 +67,21 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), ctm_wxfilename = po.GetArg(2); + if (frame_shift < 0.01 && precision <= 2) + precision = 3; + if (frame_shift < 0.001 && precision <= 3) + precision = 4; + + SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + int32 n_done = 0, n_err = 0; Output ko(ctm_wxfilename, false); // false == non-binary write mode. ko.Stream() << std::fixed; // Set to "fixed" floating point model, where precision() specifies // the #digits after the decimal point. ko.Stream().precision(precision); - + for (; !clat_reader.Done(); clat_reader.Next()) { std::string key = clat_reader.Key(); CompactLattice clat = clat_reader.Value(); @@ -84,7 +95,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(words.size() == times.size() && words.size() == lengths.size()); for (size_t i = 0; i < words.size(); i++) { - if (words[i] == 0) // Don't output anything for links, which + if (words[i] == 0 && !print_silence) // Don't output anything for links, which continue; // correspond to silence.... ko.Stream() << key << " 1 " << (frame_shift * times[i]) << ' ' << (frame_shift * lengths[i]) << ' ' << words[i] < +#include +#include +#include +#include +#include "lm/kaldi-lm.h" + +#include "lm/arpa-file-parser.h" + +namespace kaldi { +namespace { + +const int kMaxOrder = 3; + +struct NGramTestData { + int32 line_number; + float logprob; + int32 words[kMaxOrder]; + float backoff; +}; + +std::ostream& operator<<(std::ostream& os, const NGramTestData& data) { + std::ios::fmtflags saved_state(os.flags()); + os << std::fixed << std::setprecision(6); + + os << data.logprob << ' '; + for (int i = 0; i < kMaxOrder; ++i) os << data.words[i] << ' '; + os << data.backoff << " // Line " << data.line_number; + + os.flags(saved_state); + return os; +} + +// This does not own the array pointer, and uset to simplify passing expected +// result to TestableArpaFileParser::Verify. 
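The CountedArray / MakeCountedArray helpers defined next rely on deducing a C array's length through a reference-to-array template parameter, so the expected-result tables can be passed around without an explicit element count. The idiom in isolation (illustrative only, not part of the patch):

#include <cstddef>

// N is deduced from the array's static size at the call site, so the caller
// never has to pass the element count explicitly.
template <typename T, std::size_t N>
std::size_t ArrayLength(T (&)[N]) { return N; }

// e.g.:  int counts[] = { 4, 2, 2 };  ArrayLength(counts) == 3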
+template +struct CountedArray { + template + CountedArray(T(&array)[N]) : array(array), count(N) { } + const T* array; + const size_t count; +}; + +template +inline CountedArray MakeCountedArray(T(&array)[N]) { + return CountedArray(array); +} + +class TestableArpaFileParser : public ArpaFileParser { + public: + TestableArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : ArpaFileParser(options, symbols), + header_available_(false), + read_complete_(false), + last_order_(0) { } + void Validate(CountedArray counts, CountedArray ngrams); + + private: + // ArpaFileParser overrides. + virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + + bool header_available_; + bool read_complete_; + int32 last_order_; + std::vector ngrams_; +}; + +void TestableArpaFileParser::HeaderAvailable() { + KALDI_ASSERT(!header_available_); + KALDI_ASSERT(!read_complete_); + header_available_ = true; + KALDI_ASSERT(NgramCounts().size() <= kMaxOrder); +} + +void TestableArpaFileParser::ConsumeNGram(const NGram& ngram) { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + KALDI_ASSERT(ngram.words.size() <= NgramCounts().size()); + KALDI_ASSERT(ngram.words.size() >= last_order_); + last_order_ = ngram.words.size(); + + NGramTestData entry = { 0 }; + entry.line_number = LineNumber(); + entry.logprob = ngram.logprob; + entry.backoff = ngram.backoff; + std::copy(ngram.words.begin(), ngram.words.end(), entry.words); + ngrams_.push_back(entry); +} + +void TestableArpaFileParser::ReadComplete() { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + read_complete_ = true; +} + +// +bool CompareNgrams(const NGramTestData& actual, + const NGramTestData& expected) { + if (actual.line_number != expected.line_number + || !std::equal(actual.words, actual.words + kMaxOrder, + expected.words) + || !ApproxEqual(actual.logprob, expected.logprob) + || !ApproxEqual(actual.backoff, expected.backoff)) { + KALDI_WARN << "Actual n-gram [" << actual + << "] differs from expected [" << expected << "]"; + return false; + } + return true; +} + +void TestableArpaFileParser::Validate( + CountedArray expect_counts, + CountedArray expect_ngrams) { + // This needs better disagnostics probably. + KALDI_ASSERT(NgramCounts().size() == expect_counts.count); + KALDI_ASSERT(std::equal(NgramCounts().begin(), NgramCounts().end(), + expect_counts.array)); + + KALDI_ASSERT(ngrams_.size() == expect_ngrams.count); + // auto mpos = std::mismatch(ngrams_.begin(), ngrams_.end(), + // expect_ngrams.array, CompareNgrams); + // if (mpos.first != ngrams_.end()) + // KALDI_ERR << "Maismatch at index " << mpos.first - ngrams_.begin(); + //TODO:auto above requres C++11, and I cannot spell out the type!!! + KALDI_ASSERT(std::equal(ngrams_.begin(), ngrams_.end(), + expect_ngrams.array, CompareNgrams)); +} + +// Read integer LM (no symbols) with log base conversion. 
+void ReadIntegerLmLogconvExpectSuccess() { + KALDI_LOG << "ReadIntegerLmLogconvExpectSuccess()"; + + static std::string integer_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.234679 4 -3.3\n\ +-3.456783 5\n\ +0.0000000 1 -2.5\n\ +-4.333333 2\n\ +\n\ +\\2-grams:\n\ +-1.45678 4 5 -3.23\n\ +-1.30490 1 4 -4.2\n\ +\n\ +\\3-grams:\n\ +-0.34958 1 4 5\n\ +-0.23940 4 5 2\n\ +\n\ +\\end\\"; + + int32 expect_counts[] = { 4, 2, 2 }; + NGramTestData expect_ngrams[] = { + { 7, -12.05329, { 4, 0, 0 }, -7.598531 }, + { 8, -7.959537, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -5.756463 }, + { 10, -9.977868, { 2, 0, 0 }, 0.0 }, + + { 13, -3.354360, { 4, 5, 0 }, -7.437350 }, + { 14, -3.004643, { 1, 4, 0 }, -9.670857 }, + + { 17, -0.804938, { 1, 4, 5 }, 0.0 }, + { 18, -0.551239, { 4, 5, 2 }, 0.0 } }; + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + + TestableArpaFileParser parser(options, NULL); + std::istringstream stm(integer_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_ngrams)); +} + +// \xCE\xB2 = UTF-8 for Greek beta, to churn some UTF-8 cranks. +static std::string symbolic_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.2 a -3.3\n\ +-3.4 \xCE\xB2\n\ +0.0 -2.5\n\ +-4.3 \n\ +\n\ +\\2-grams:\n\ +-1.5 a \xCE\xB2 -3.2\n\ +-1.3 a -4.2\n\ +\n\ +\\3-grams:\n\ +-0.3 a \xCE\xB2\n\ +-0.2 a \n\ +\n\ +\\end\\"; + +// Symbol table that is created with predefined test symbols, "a" but no "b". +class TestSymbolTable : public fst::SymbolTable { + public: + TestSymbolTable() { + AddSymbol("", 0); + AddSymbol("", 1); + AddSymbol("", 2); + AddSymbol("", 3); + AddSymbol("a", 4); + } +}; + +// Full expected result shared between ReadSymbolicLmNoOovImpl and +// ReadSymbolicLmWithOovAddToSymbols(). +NGramTestData expect_symbolic_full[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 5, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 5 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + +// This is run with all possible oov setting and yields same result. +void ReadSymbolicLmNoOovImpl(ArpaParseOptions::OovHandling oov) { + int32 expect_counts[] = { 4, 2, 2 }; + TestSymbolTable symbols; + symbols.AddSymbol("\xCE\xB2", 5); + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, &symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_symbolic_full)); + KALDI_ASSERT(symbols.NumSymbols() == 6); +} + +void ReadSymbolicLmNoOovTests() { + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kRaiseError)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kRaiseError); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kAddToSymbols)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kAddToSymbols); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kReplaceWithUnk)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kReplaceWithUnk); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kSkipNGram)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kSkipNGram); +} + +// This is run with all possible oov setting and yields same result. 
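The integer-LM test above feeds ARPA scores in log base 10 and, because use_log10 is left at its default of false, expects them back converted to natural logarithms. A tiny sketch of that conversion (not Kaldi code) reproduces the first expected unigram score:

#include <cmath>
#include <cstdio>

int main() {
  // The parser rescales ARPA log10 scores to natural logs: score_e = score_10 * ln(10).
  double log10_score = -5.234679;                 // first unigram in the test LM
  double natural = log10_score * std::log(10.0);  // ln(10) is roughly 2.302585
  std::printf("%.5f\n", natural);                 // prints -12.05329, as in expect_ngrams
  return 0;
}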
+void ReadSymbolicLmWithOovImpl( + ArpaParseOptions::OovHandling oov, + CountedArray expect_ngrams, + fst::SymbolTable* symbols) { + int32 expect_counts[] = { 4, 2, 2 }; + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), expect_ngrams); +} + +void ReadSymbolicLmWithOovAddToSymbols() { + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kAddToSymbols, + MakeCountedArray(expect_symbolic_full), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 6); + KALDI_ASSERT(symbols.Find("\xCE\xB2") == 5); +} + +void ReadSymbolicLmWithOovReplaceWithUnk() { + NGramTestData expect_symbolic_unk_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 3, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 3, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 3 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kReplaceWithUnk, + MakeCountedArray(expect_symbolic_unk_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovSkipNGram() { + NGramTestData expect_symbolic_no_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kSkipNGram, + MakeCountedArray(expect_symbolic_no_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovTests() { + KALDI_LOG << "ReadSymbolicLmWithOovAddToSymbols()"; + ReadSymbolicLmWithOovAddToSymbols(); + KALDI_LOG << "ReadSymbolicLmWithOovReplaceWithUnk()"; + ReadSymbolicLmWithOovReplaceWithUnk(); + KALDI_LOG << "ReadSymbolicLmWithOovSkipNGram()"; + ReadSymbolicLmWithOovSkipNGram(); +} + +} // namespace +} // namespace kaldi + +int main(int argc, char *argv[]) { + kaldi::ReadIntegerLmLogconvExpectSuccess(); + kaldi::ReadSymbolicLmNoOovTests(); + kaldi::ReadSymbolicLmWithOovTests(); +} diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc new file mode 100644 index 00000000000..2d8f9f18638 --- /dev/null +++ b/src/lm/arpa-file-parser.cc @@ -0,0 +1,236 @@ +// lm/arpa-file-parser.cc + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
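The three WithOov tests above differ only in how an out-of-vocabulary word (the beta symbol, absent from TestSymbolTable) is treated. A rough stand-in for the three policies, using a plain std::map instead of fst::SymbolTable and illustrative symbol ids:

#include <iostream>
#include <map>
#include <string>

enum OovHandling { kRaiseError, kAddToSymbols, kReplaceWithUnk, kSkipNGram };

// Maps a word to an integer id, applying the chosen OOV policy when it is absent.
int MapWord(const std::string& w, OovHandling oov, int unk_id,
            std::map<std::string, int>* table, bool* skip_ngram) {
  std::map<std::string, int>::const_iterator it = table->find(w);
  if (it != table->end()) return it->second;     // in-vocabulary word
  switch (oov) {
    case kAddToSymbols: {                        // grow the symbol table
      int id = static_cast<int>(table->size());
      (*table)[w] = id;
      return id;
    }
    case kReplaceWithUnk:                        // substitute the <unk> id
      return unk_id;
    case kSkipNGram:                             // drop the whole n-gram
      *skip_ngram = true;
      return -1;
    default:                                     // kRaiseError
      std::cerr << "OOV word: " << w << std::endl;
      return -1;
  }
}

int main() {
  std::map<std::string, int> table;
  table["<eps>"] = 0; table["<s>"] = 1; table["</s>"] = 2;
  table["<unk>"] = 3; table["a"] = 4;
  bool skip = false;
  std::cout << MapWord("b", kReplaceWithUnk, 3, &table, &skip) << std::endl;  // 3
  std::cout << MapWord("b", kAddToSymbols, 3, &table, &skip) << std::endl;    // 5
  return 0;
}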
+ +#include + +#include + +#include "base/kaldi-error.h" +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" +#include "util/text-utils.h" + +namespace kaldi { + +ArpaFileParser::ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : options_(options), symbols_(symbols), line_number_(0) { +} + +ArpaFileParser::~ArpaFileParser() { +} + +void ArpaFileParser::Read(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + + // Argument sanity checks. + if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 || + options_.bos_symbol == options_.eos_symbol) + KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and " + << "differ from each other. Given:" + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && + options_.oov_handling == ArpaParseOptions::kReplaceWithUnk && + (options_.unk_symbol <= 0 || + options_.unk_symbol == options_.bos_symbol || + options_.unk_symbol == options_.eos_symbol)) + KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, " + << "UNK symbol is required, must not be epsilon, and " + << "differ from both BOS and EOS symbols. Given:" + << " UNK=" << options_.unk_symbol + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty()) + KALDI_ERR << "BOS symbol must exist in symbol table"; + if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty()) + KALDI_ERR << "EOS symbol must exist in symbol table"; + if (symbols_ != NULL && options_.unk_symbol > 0 && + symbols_->Find(options_.unk_symbol).empty()) + KALDI_ERR << "UNK symbol must exist in symbol table"; + + ngram_counts_.clear(); + line_number_ = 0; + +#define PARSE_ERR (KALDI_ERR << "in line " << line_number_ << ": ") + + // Give derived class an opportunity to prepare its state. + ReadStarted(); + + std::string line; + + // Processes "\data\" section. + bool keyword_found = false; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + + // The section keywords starts with backslash. We terminate the while loop + // if a new section is found. + if (line[0] == '\\') { + if (!keyword_found && line == "\\data\\") { + KALDI_LOG << "Reading \\data\\ section."; + keyword_found = true; + continue; + } + break; + } + + if (!keyword_found) continue; + + // Enters "\data\" section, and looks for patterns like "ngram 1=1000", + // which means there are 1000 unigrams. + std::size_t equal_symbol_pos = line.find("="); + if (equal_symbol_pos != std::string::npos) + line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" + std::vector col; + SplitStringToVector(line, " \t", true, &col); + if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") { + int32 order, ngram_count = 0; + if (!ConvertStringToInteger(col[1], &order) || + !ConvertStringToInteger(col[3], &ngram_count)) { + PARSE_ERR << "Cannot parse ngram count '" << line << "'."; + } + if (ngram_counts_.size() <= order) { + ngram_counts_.resize(order); + } + ngram_counts_[order - 1] = ngram_count; + } else { + KALDI_WARN << "Uninterpretable line in \\data\\ section: " << line; + } + } + + if (ngram_counts_.size() == 0) + PARSE_ERR << "\\data\\ section missing or empty."; + + // Signal that grammar order and n-gram counts are known. + HeaderAvailable(); + + NGram ngram; + ngram.words.reserve(ngram_counts_.size()); + + // Processes "\N-grams:" section. 
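Before the n-gram sections are consumed, the \data\ header lines of the form "ngram N=count" are split around "=" and converted to integers, which fills ngram_counts_. A hypothetical standalone helper sketching the same parsing step (not the Kaldi function):

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Parses one "ngram N=count" line and records the count for that order.
bool ParseNgramCountLine(const std::string& line_in, std::vector<int>* counts) {
  std::string line(line_in);
  std::string::size_type eq = line.find('=');
  if (eq == std::string::npos) return false;
  line.replace(eq, 1, " = ");                 // same trick as the parser: pad "="
  std::istringstream ss(line);
  std::string kw, eq_tok;
  int order = 0, count = 0;
  if (!(ss >> kw >> order >> eq_tok >> count) || kw != "ngram" || eq_tok != "=")
    return false;
  if (static_cast<int>(counts->size()) < order) counts->resize(order, 0);
  (*counts)[order - 1] = count;
  return true;
}

int main() {
  std::vector<int> counts;
  ParseNgramCountLine("ngram 1=4", &counts);
  ParseNgramCountLine("ngram 2=2", &counts);
  ParseNgramCountLine("ngram 3=2", &counts);
  for (std::size_t i = 0; i < counts.size(); ++i)
    std::cout << "order " << i + 1 << ": " << counts[i] << std::endl;
  return 0;
}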
+ for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) { + // Skips n-grams with zero count. + if (ngram_counts_[cur_order - 1] == 0) { + KALDI_WARN << "Zero ngram count in ngram order " << cur_order + << "(look for 'ngram " << cur_order << "=0' in the \\data\\ " + << " section). There is possibly a problem with the file."; + continue; + } + + // Must be looking at a \k-grams: directive at this point. + std::ostringstream keyword; + keyword << "\\" << cur_order << "-grams:"; + if (line != keyword.str()) { + PARSE_ERR << "Invalid directive '" << line << "', " + << "expecting '" << keyword.str() << "'."; + } + KALDI_LOG << "Reading " << line << " section."; + + int32 ngram_count = 0; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + if (line[0] == '\\') break; + + std::vector col; + SplitStringToVector(line, " \t", true, &col); + + if (col.size() < 1 + cur_order || + col.size() > 2 + cur_order || + (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) { + PARSE_ERR << "Invalid n-gram line '" << line << "'"; + } + ++ngram_count; + + // Parse out n-gram logprob and, if present, backoff weight. + if (!ConvertStringToReal(col[0], &ngram.logprob)) { + PARSE_ERR << "Invalid n-gram logprob '" << col[0] << "'."; + } + ngram.backoff = 0.0; + if (col.size() > cur_order + 1) { + if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff)) + PARSE_ERR << "Invalid backoff weight '" << col[cur_order + 1] << "'."; + } + // Convert to natural log unless the option is set not to. + if (!options_.use_log10) { + ngram.logprob *= M_LN10; + ngram.backoff *= M_LN10; + } + + ngram.words.resize(cur_order); + bool skip_ngram = false; + for (int32 index = 0; !skip_ngram && index < cur_order; ++index) { + int32 word; + if (symbols_) { + // Symbol table provided, so symbol labels are expected. + if (options_.oov_handling == ArpaParseOptions::kAddToSymbols) { + word = symbols_->AddSymbol(col[1 + index]); + } else { + word = symbols_->Find(col[1 + index]); + if (word == fst::SymbolTable::kNoSymbol) { + switch(options_.oov_handling) { + case ArpaParseOptions::kReplaceWithUnk: + word = options_.unk_symbol; + break; + case ArpaParseOptions::kSkipNGram: + skip_ngram = true; + break; + default: + PARSE_ERR << "Word '" << col[1 + index] + << "' not in symbol table."; + } + } + } + } else { + // Symbols not provided, LM file should contain integers. + if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) { + PARSE_ERR << "invalid symbol '" << col[1 + index] << "'"; + } + } + // Whichever way we got it, an epsilon is invalid. 
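One detail worth calling out in the loop above is the layout check on each n-gram line: a valid entry has one logprob column, then N word columns, then an optional backoff weight, and the highest order never carries a backoff. An illustrative check of just that rule (not Kaldi's code), using lines from the test LM:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Returns true if the line has an acceptable number of columns for this order.
bool CheckNgramColumns(const std::string& line, size_t order, size_t max_order) {
  std::istringstream ss(line);
  std::vector<std::string> col;
  std::string tok;
  while (ss >> tok) col.push_back(tok);
  if (col.size() < 1 + order || col.size() > 2 + order)
    return false;                              // wrong number of fields
  if (order == max_order && col.size() != 1 + order)
    return false;                              // no backoff on the highest order
  return true;
}

int main() {
  std::cout << CheckNgramColumns("-1.45678 4 5 -3.23", 2, 3) << std::endl;   // 1
  std::cout << CheckNgramColumns("-0.34958 1 4 5", 3, 3) << std::endl;       // 1
  std::cout << CheckNgramColumns("-0.34958 1 4 5 -0.1", 3, 3) << std::endl;  // 0
  return 0;
}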
+ if (word == 0) { + PARSE_ERR << "Epsilon symbol '" << col[1 + index] + << "' is illegal in ARPA LM."; + } + ngram.words[index] = word; + } + if (!skip_ngram) { + ConsumeNGram(ngram); + } + } + if (ngram_count > ngram_counts_[cur_order - 1]) { + PARSE_ERR << "Header said there would be " << ngram_counts_[cur_order] + << " n-grams of order " << cur_order << ", but we saw " + << ngram_count; + } + } + + if (line != "\\end\\") { + PARSE_ERR << "Invalid or unexpected directive line '" << line << "', " + << "expected \\end\\."; + } + + ReadComplete(); + +#undef PARSE_ERR +} + +} // namespace kaldi diff --git a/src/lm/arpa-file-parser.h b/src/lm/arpa-file-parser.h new file mode 100644 index 00000000000..0011fb4ee21 --- /dev/null +++ b/src/lm/arpa-file-parser.h @@ -0,0 +1,125 @@ +// lm/arpa-file-parser.h + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_LM_ARPA_FILE_PARSER_H_ +#define KALDI_LM_ARPA_FILE_PARSER_H_ + +#include +#include + +#include + +#include "base/kaldi-types.h" + +namespace kaldi { + +/** + Options that control ArpaFileParser +*/ +struct ArpaParseOptions { + enum OovHandling { + kRaiseError, ///< Abort on OOV words + kAddToSymbols, ///< Add novel words to the symbol table. + kReplaceWithUnk, ///< Replace OOV words with . + kSkipNGram ///< Skip n-gram with OOV word and continue. + }; + + ArpaParseOptions() + : bos_symbol(-1), eos_symbol(-1), unk_symbol(-1), + oov_handling(kRaiseError), use_log10(false) { } + + int32 bos_symbol; ///< Symbol for , Required non-epsilon. + int32 eos_symbol; ///< Symbol for , Required non-epsilon. + int32 unk_symbol; ///< Symbol for , Required for kReplaceWithUnk. + OovHandling oov_handling; ///< How to handle OOV words in the file. + bool use_log10; ///< Use log10 for prob and backoff weight, not ln. +}; + +/** + A parsed n-gram from ARPA LM file. +*/ +struct NGram { + NGram() : logprob(0.0), backoff(0.0) { } + std::vector words; ///< Symbols in LTR order. + float logprob; ///< Log-prob of the n-gram. + float backoff; ///< log-backoff weight of the n-gram. +}; + +/** + ArpaFileParser is an abstract base class for ARPA LM file conversion. + + See ConstArpaLmBuilder for a usage example. +*/ +class ArpaFileParser { + public: + /// Constructs the parser with the given options and optional symbol table. + /// If symbol table is provided, then the file should contain text n-grams, + /// and the words are mapped to symbols through it. bos_symbol and + /// eos_symbol in the options structure must be valid symbols in the table, + /// and so must be unk_symbol if provided. The table is not owned by the + /// parser, but may be augmented, if oov_handling is set to kAddToSymbols. + /// If symbol table is a null pointer, the file should contain integer + /// symbol values, and oov_handling has no effect. 
bos_symbol and eos_symbol + /// must be valid symbols still. + ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols); + virtual ~ArpaFileParser(); + + /// Read ARPA LM file through Kaldi I/O functions. Only text mode is + /// supported. + void Read(std::istream &is, bool binary); + + const ArpaParseOptions& Options() const { return options_; } + + protected: + /// Override called before reading starts. This is the point to prepare + /// any state in the derived class. + virtual void ReadStarted() { } + + /// Override function called to signal that ARPA header with the expected + /// number of n-grams has been read, and ngram_counts() is now valid. + virtual void HeaderAvailable() { } + + /// Pure override that must be implemented to process current n-gram. The + /// n-grams are sent in the file order, which guarantees that all + /// (k-1)-grams are processed before the first k-gram is. + virtual void ConsumeNGram(const NGram&) = 0; + + /// Override function called after the last n-gram has been consumed. + virtual void ReadComplete() { } + + /// Read-only access to symbol table. + const fst::SymbolTable* Symbols() const { return symbols_; } + + /// Inside ConsumeNGram(), provides the current line number. + int32 LineNumber() const { return line_number_; } + + /// N-gram counts. Valid in and after a call to HeaderAvailable(). + const std::vector& NgramCounts() const { return ngram_counts_; } + + private: + ArpaParseOptions options_; + fst::SymbolTable* symbols_; // Not owned. + int32 line_number_; + std::vector ngram_counts_; +}; + +} // namespace kaldi + +#endif // KALDI_LM_ARPA_FILE_PARSER_H_ diff --git a/src/lm/const-arpa-lm.cc b/src/lm/const-arpa-lm.cc index 7f63dce886e..5043933d7f0 100644 --- a/src/lm/const-arpa-lm.cc +++ b/src/lm/const-arpa-lm.cc @@ -22,13 +22,14 @@ #include #include +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" #include "lm/const-arpa-lm.h" #include "util/stl-utils.h" #include "util/text-utils.h" -#include "base/kaldi-math.h" -namespace kaldi { +namespace kaldi { // Auxiliary struct for converting ConstArpaLm format langugae model to Arpa // format. @@ -173,13 +174,10 @@ class LmState { // Class to build ConstArpaLm from Arpa format language model. It relies on the // auxiliary class LmState above. -class ConstArpaLmBuilder { +class ConstArpaLmBuilder : public ArpaFileParser { public: - ConstArpaLmBuilder( - const bool natural_base, const int32 bos_symbol, - const int32 eos_symbol, const int32 unk_symbol) : - natural_base_(natural_base), bos_symbol_(bos_symbol), - eos_symbol_(eos_symbol), unk_symbol_(unk_symbol) { + ConstArpaLmBuilder(ArpaParseOptions options) + : ArpaFileParser(options, NULL) { ngram_order_ = 0; num_words_ = 0; overflow_buffer_size_ = 0; @@ -204,21 +202,21 @@ class ConstArpaLmBuilder { } } - // Reads in the Arpa format language model, parses it and creates LmStates. - void Read(std::istream &is, bool binary); - // Writes ConstArpaLm. void Write(std::ostream &os, bool binary) const; - // Builds ConstArpaLm. - void Build(); - void SetMaxAddressOffset(const int32 max_address_offset) { KALDI_WARN << "You are changing ; the default should " << "not be changed unless you are in testing mode."; max_address_offset_ = max_address_offset; } + protected: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + private: struct WordsAndLmStatePairLessThan { bool operator()( @@ -229,10 +227,6 @@ class ConstArpaLmBuilder { }; private: - // If true, use natural base e for log-prob, otherwise use base 10. The - // default base in Arpa format language model is base 10. - bool natural_base_; - // Indicating if ConstArpaLm has been built or not. bool is_built_; @@ -240,16 +234,6 @@ class ConstArpaLmBuilder { // The default value is 30-bits and should not be changed except for testing. int32 max_address_offset_; - // Integer corresponds to . - int32 bos_symbol_; - - // Integer corresponds to . - int32 eos_symbol_; - - // Integer corresponds to unknown-word. -1 if no unknown-word symbol is - // provided. - int32 unk_symbol_; - // N-gram order of language model. This can be figured out from "/data/" // section in Arpa format language model. int32 ngram_order_; @@ -280,201 +264,58 @@ class ConstArpaLmBuilder { LmState*, VectorHasher > seq_to_state_; }; -// Reads in the Arpa format language model, parses it and puts the word sequence -// into the corresponding LmState in . -void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { - if (binary) { - KALDI_ERR << "binary-mode reading is not implemented for " - << "ConstArpaLmBuilder."; - } - - std::string line; - - // Number of n-grams from "\data\" section. Those numbers should match the - // actual number of n-grams from "\N-grams:" sections. - // Note that when we convert the words in the Arpa format language model into - // integers, we remove lines with OOV words. We also modify the n-gram counts - // in "\data\" correspondingly. - std::vector num_ngrams; - - // Processes "\data\" section. - bool keyword_found = false; - while (getline(is, line) && !is.eof()) { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos) break; - if (line.find("\\end\\") != std::string::npos) break; - } - - std::size_t equal_symbol_pos = line.find("="); - if (equal_symbol_pos != std::string::npos) - line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\data\". - if (!keyword_found && col.size() == 1 && col[0] == "\\data\\") { - KALDI_LOG << "Reading \"\\data\\\" section."; - keyword_found = true; - continue; - } +void ConstArpaLmBuilder::HeaderAvailable() { + ngram_order_ = NgramCounts().size(); +} - // Enters "\data\" section, and looks for patterns like"ngram 1=1000", which - // means there are 1000 unigrams. 
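With the members removed above, the BOS/EOS/UNK symbols and the log base now reach the builder through ArpaParseOptions rather than dedicated constructor arguments, which is exactly what BuildConstArpaLm does further down. A small sketch of that configuration step, with placeholder symbol ids:

#include "lm/arpa-file-parser.h"

// Builds the options the refactored ConstArpaLmBuilder consumes; the ids are
// illustrative, in practice they come from the caller of BuildConstArpaLm.
kaldi::ArpaParseOptions MakeBuilderOptions(bool natural_base) {
  kaldi::ArpaParseOptions options;
  options.bos_symbol = 1;              // <s>
  options.eos_symbol = 2;              // </s>
  options.unk_symbol = 3;              // <unk>; use -1 if the LM has none
  options.use_log10 = !natural_base;   // the builder defaults to natural logs
  return options;
}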
- if (keyword_found && col.size() == 4 && col[0] == "ngram") { - if (col[2] == "=") { - int32 order, ngram_count; - if (!ConvertStringToInteger(col[1], &order)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1] << " to integer."; - } - if (!ConvertStringToInteger(col[3], &ngram_count)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[3] << " to integer."; - } - if (num_ngrams.size() <= order) { - num_ngrams.resize(order + 1); - } - num_ngrams[order] = ngram_count; - } else { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } - } else if (keyword_found) { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } +void ConstArpaLmBuilder::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + // If is larger than 1, then we do not create LmState for + // the final order entry. We only keep the log probability for it. + LmState *lm_state = NULL; + if (cur_order != ngram_order_ || ngram_order_ == 1) { + lm_state = new LmState(cur_order == 1, + cur_order == ngram_order_ - 1, + ngram.logprob, ngram.backoff); + + KALDI_ASSERT(seq_to_state_.find(ngram.words) == seq_to_state_.end()); + seq_to_state_[ngram.words] = lm_state; } - if (num_ngrams.size() == 0) - KALDI_ERR << "Fail to read \"\\data\\\" section."; - ngram_order_ = num_ngrams.size() - 1; - - // Processes "\N-grams:" section. - int32 max_word_id = 0; - for (int32 cur_order = 1; cur_order < num_ngrams.size(); ++cur_order) { - // Skips n-grams with zero count. - if (num_ngrams[cur_order] == 0) continue; - - keyword_found = false; - int32 ngram_count = 0; - std::ostringstream keyword; - keyword << "\\" << cur_order << "-grams:"; - // We use "do ... while" loop since one line has already been read. - do { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos && keyword_found) break; - if (line.find("\\end\\") != std::string::npos) break; - } - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\N-gram:" if the keyword has not been located. - if (!keyword_found && col.size() == 1 && col[0] == keyword.str()) { - KALDI_LOG << "Reading \"" << keyword.str() << "\" section."; - ngram_count = 0; - keyword_found = true; - continue; - } - - // Enters "\N-grams:" section if the keyword has been located. - if (keyword_found && col.size() > 0) { - KALDI_ASSERT(col.size() >= 1 + cur_order); - KALDI_ASSERT(col.size() <= 2 + cur_order); // backoff_logprob can be 0. - if (cur_order == ngram_order_ && col.size() == 2 + cur_order) { - KALDI_ERR << "Backoff probability detected for final-order entry \"" - << line << "\"."; - } - ngram_count++; - - // If backoff_logprob is 0, it will not appear in Arpa format language - // model. We put it back so the processing afterwards will be easier. - if (col.size() == 1 + cur_order) { - col.push_back("0"); - } - - // Creates LmState for the current word sequence. - bool is_unigram = (cur_order == 1) ? true : false; - float logprob; - float backoff_logprob; - KALDI_ASSERT(ConvertStringToReal(col[0], &logprob)); - KALDI_ASSERT(ConvertStringToReal(col[1 + cur_order], &backoff_logprob)); - if (natural_base_) { - logprob *= Log(10.0f); - backoff_logprob *= Log(10.0f); - } - - // If is larger than 1, then we do not create LmState for - // the final order entry. We only keep the log probability for it. 
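Both the parsing code being removed here and the ConsumeNGram() override that replaces it rely on the same two assumptions restated in the comments below: n-grams arrive from lower to higher order, and every n-gram's history is itself a valid n-gram seen earlier. A toy illustration of that invariant, using std::map in place of Kaldi's hash map and made-up word ids:

#include <iostream>
#include <map>
#include <vector>

int main() {
  typedef std::vector<int> Seq;
  std::map<Seq, float> states;            // word sequence -> some state payload

  // Unigrams 4 and 5, then the bigram "4 5"; zeros just pad the rows.
  int grams[][3] = { {4, 0, 0}, {5, 0, 0}, {4, 5, 0} };
  for (int g = 0; g < 3; ++g) {
    Seq words;
    for (int i = 0; i < 3 && grams[g][i] != 0; ++i) words.push_back(grams[g][i]);
    if (words.size() > 1) {
      Seq hist(words.begin(), words.end() - 1);
      if (states.find(hist) == states.end()) {   // the parent n-gram must exist
        std::cerr << "missing history for a " << words.size() << "-gram\n";
        return 1;
      }
    }
    states[words] = 0.0f;                        // register this n-gram as a state
  }
  std::cout << "all histories present\n";
  return 0;
}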
- LmState *lm_state = NULL; - if (cur_order != ngram_order_ || ngram_order_ == 1) { - lm_state = new LmState(is_unigram, - (cur_order == ngram_order_ - 1), - logprob, backoff_logprob); - } - - // Figures out the sequence of words. - std::vector seq(cur_order, 0); - for (int32 index = 0; index < cur_order; ++index) { - int32 word; - if (!ConvertStringToInteger(col[1 + index], &word)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1 + index] << " to integer."; - } - seq[index] = word; - } - - // If is larger than 1, then we do not insert LmState to - // . - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(seq_to_state_.find(seq) == seq_to_state_.end()); - seq_to_state_[seq] = lm_state; - } - - // If n-gram order is larger than 1, we have to add possible child to - // existing LmStates. We have the following two assumptions: - // 1. N-grams are processed from small order to larger ones, i.e., from - // 1, 2, ... to the highest order. - // 2. If a n-gram exists in the Arpa format language model, then the - // "history" n-gram also exists. For example, if "A B C" is a valid - // n-gram, then "A B" is also a valid n-gram. - if (cur_order > 1) { - std::vector hist(seq.begin(), seq.begin() + cur_order - 1); - int32 word = seq[seq.size() - 1]; - unordered_map, - LmState*, VectorHasher >::iterator hist_iter; - hist_iter = seq_to_state_.find(hist); - KALDI_ASSERT(hist_iter != seq_to_state_.end()); - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, lm_state); - } else { - KALDI_ASSERT(lm_state == NULL); - KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, logprob); - } - } else { - // Figures out . - KALDI_ASSERT(seq.size() == 1); - if (seq[0] > max_word_id) { - max_word_id = seq[0]; - } - } - } - } while (getline(is, line) && !is.eof()); - if (ngram_count > num_ngrams[cur_order] || - (ngram_count == 0 && num_ngrams[cur_order] != 0)) { - KALDI_ERR << "Header said there would be " << num_ngrams[cur_order] - << " n-grams of order " << cur_order << ", but we saw " - << ngram_count; + // If n-gram order is larger than 1, we have to add possible child to + // existing LmStates. We have the following two assumptions: + // 1. N-grams are processed from small order to larger ones, i.e., from + // 1, 2, ... to the highest order. + // 2. If a n-gram exists in the Arpa format language model, then the + // "history" n-gram also exists. For example, if "A B C" is a valid + // n-gram, then "A B" is also a valid n-gram. + int32 last_word = ngram.words[cur_order - 1]; + if (cur_order > 1) { + std::vector hist(ngram.words.begin(), ngram.words.end() - 1); + unordered_map, + LmState*, VectorHasher >::iterator hist_iter; + hist_iter = seq_to_state_.find(hist); + if (hist_iter == seq_to_state_.end()) { + std::ostringstream ss; + for (int i = 0; i < cur_order; ++i) + ss << (i == 0 ? 
'[' : ' ') << ngram.words[i]; + KALDI_ERR << "In line " << LineNumber() << ": " + << cur_order << "-gram " << ss.str() << "] does not have " + << "a parent model " << cur_order << "-gram."; + } + if (cur_order != ngram_order_ || ngram_order_ == 1) { + KALDI_ASSERT(lm_state != NULL); + KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, lm_state); + } else { + KALDI_ASSERT(lm_state == NULL); + KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, ngram.logprob); } + } else { + // Figures out . + num_words_ = std::max(num_words_, last_word + 1); } - - // is plus 1. - num_words_ = max_word_id + 1; } // ConstArpaLm can be built in the following steps, assuming we have already @@ -503,7 +344,7 @@ void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { // At the same time, we will also create two special buffers: // // -void ConstArpaLmBuilder::Build() { +void ConstArpaLmBuilder::ReadComplete() { // STEP 1: sorting LmStates lexicographically. // Vector for holding the sorted LmStates. std::vector*, LmState*> > sorted_vec; @@ -637,9 +478,10 @@ void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const { KALDI_ASSERT(is_built_); // Creates ConstArpaLm. - ConstArpaLm const_arpa_lm(bos_symbol_, eos_symbol_, unk_symbol_, ngram_order_, - num_words_, overflow_buffer_size_, lm_states_size_, - unigram_states_, overflow_buffer_, lm_states_); + ConstArpaLm const_arpa_lm( + Options().bos_symbol, Options().eos_symbol, Options().unk_symbol, + ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_, + unigram_states_, overflow_buffer_, lm_states_); const_arpa_lm.Write(os, binary); } @@ -1224,10 +1066,15 @@ bool BuildConstArpaLm(const bool natural_base, const int32 bos_symbol, const int32 eos_symbol, const int32 unk_symbol, const std::string& arpa_rxfilename, const std::string& const_arpa_wxfilename) { - ConstArpaLmBuilder lm_builder(natural_base, bos_symbol, - eos_symbol, unk_symbol); + ArpaParseOptions options; + options.bos_symbol = bos_symbol; + options.eos_symbol = eos_symbol; + options.unk_symbol = unk_symbol; + options.use_log10 = !natural_base; + + ConstArpaLmBuilder lm_builder(options); + KALDI_LOG << "Reading " << arpa_rxfilename; ReadKaldiObject(arpa_rxfilename, &lm_builder); - lm_builder.Build(); WriteKaldiObject(lm_builder, const_arpa_wxfilename, true); return true; } diff --git a/src/lm/kaldi-rnnlm.cc b/src/lm/kaldi-rnnlm.cc index e1fbcbdc08b..3a811c4c0e5 100644 --- a/src/lm/kaldi-rnnlm.cc +++ b/src/lm/kaldi-rnnlm.cc @@ -58,8 +58,8 @@ KaldiRnnlmWrapper::KaldiRnnlmWrapper( BaseFloat KaldiRnnlmWrapper::GetLogProb( int32 word, const std::vector &wseq, - const std::vector &context_in, - std::vector *context_out) { + const std::vector &context_in, + std::vector *context_out) { std::vector wseq_symbols(wseq.size()); for (int32 i = 0; i < wseq_symbols.size(); ++i) { @@ -79,7 +79,7 @@ RnnlmDeterministicFst::RnnlmDeterministicFst(int32 max_ngram_order, // Uses empty history for . std::vector